* [PATCH net-next V2 4/7] liquidio CN23XX: VF scatter gather lists
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
To: davem
Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>
Adds support for VF scatter gather lists.
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 134 +++++++++++++++++++++
1 file changed, 134 insertions(+)
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 0e23e2f..e4ee6ec 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -55,10 +55,28 @@ struct liquidio_if_cfg_resp {
u64 status;
};
+#define OCTNIC_MAX_SG (MAX_SKB_FRAGS)
+
#define OCTNIC_GSO_MAX_HEADER_SIZE 128
#define OCTNIC_GSO_MAX_SIZE \
(CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
+/* One preallocated scatter/gather descriptor. Nodes are kept on per-queue
+ * lists (lio->glist[]) and are taken from / returned to those lists.
+ */
+struct octnic_gather {
+ /* List manipulation. Next and prev pointers.
+ * Must stay the first member: delete_glists() casts the list_head
+ * pointer returned by list_delete_head() straight to this struct.
+ */
+ struct list_head list;
+
+ /* Size of the gather component at sg in bytes. */
+ int sg_size;
+
+ /* Number of bytes that sg was adjusted to make it 8B-aligned.
+ * (sg - adjust) is the address originally returned by kmalloc() and
+ * is what has to be passed to kfree().
+ */
+ int adjust;
+
+ /* Gather component that can accommodate max sized fragment list
+ * received from the IP layer.
+ */
+ struct octeon_sg_entry *sg;
+};
+
struct octeon_device_priv {
/* Tasklet structures for this device. */
struct tasklet_struct droq_tasklet;
@@ -237,6 +255,114 @@ static void start_txq(struct net_device *netdev)
}
/**
+ * Remove the node at the head of the list and return it, or return NULL
+ * when the list has no nodes. The list is empty after this call if the
+ * removed node was the last one.
+ */
+static inline struct list_head *list_delete_head(struct list_head *root)
+{
+	struct list_head *first = root->next;
+
+	/* An empty list has both links pointing back at the root. */
+	if (root->prev == root && first == root)
+		return NULL;
+
+	/* A head that was never linked (NULL next) yields nothing either. */
+	if (!first)
+		return NULL;
+
+	list_del(first);
+
+	return first;
+}
+
+/**
+ * \brief Delete gather lists
+ * @param lio per-network private data
+ *
+ * Frees every gather node still queued on the per-queue lists, then the
+ * arrays of list heads and list locks. Both array pointers in @lio are
+ * cleared afterwards so that calling this function again is a no-op
+ * rather than a double free.
+ */
+static void delete_glists(struct lio *lio)
+{
+	struct octnic_gather *g;
+	int i;
+
+	if (!lio->glist)
+		return;
+
+	for (i = 0; i < lio->linfo.num_txpciq; i++) {
+		do {
+			/* The list_head is the first member of the node, so
+			 * the pointer can be cast back to the node itself.
+			 */
+			g = (struct octnic_gather *)
+				list_delete_head(&lio->glist[i]);
+			if (g) {
+				/* Undo the alignment shift to get back the
+				 * address that kmalloc() returned.
+				 */
+				if (g->sg)
+					kfree((void *)((unsigned long)g->sg -
+						       g->adjust));
+				kfree(g);
+			}
+		} while (g);
+	}
+
+	kfree(lio->glist);
+	lio->glist = NULL;
+
+	kfree(lio->glist_lock);
+	lio->glist_lock = NULL;
+}
+
+/**
+ * \brief Setup gather lists
+ * @param lio per-network private data
+ * @param num_iqs number of instruction queues; one list and one lock is
+ *                created per queue
+ * @returns 0 on success, 1 if any allocation failed (everything that was
+ *          allocated is released again in that case)
+ *
+ * Each list is filled with lio->tx_qsize gather nodes whose sg area is
+ * aligned on a 64-bit boundary.
+ */
+static int setup_glists(struct lio *lio, int num_iqs)
+{
+	struct octnic_gather *g;
+	int i, j;
+
+	/* kcalloc() checks the count * size multiplication for overflow. */
+	lio->glist_lock = kcalloc(num_iqs, sizeof(*lio->glist_lock),
+				  GFP_KERNEL);
+	if (!lio->glist_lock)
+		return 1;
+
+	lio->glist = kcalloc(num_iqs, sizeof(*lio->glist), GFP_KERNEL);
+	if (!lio->glist) {
+		kfree(lio->glist_lock);
+		lio->glist_lock = NULL;
+		return 1;
+	}
+
+	/* Initialise every head and lock up front so that a teardown after
+	 * a partial failure only ever walks valid (possibly empty) lists.
+	 */
+	for (i = 0; i < num_iqs; i++) {
+		spin_lock_init(&lio->glist_lock[i]);
+		INIT_LIST_HEAD(&lio->glist[i]);
+	}
+
+	for (i = 0; i < num_iqs; i++) {
+		for (j = 0; j < lio->tx_qsize; j++) {
+			g = kzalloc(sizeof(*g), GFP_KERNEL);
+			if (!g)
+				break;
+
+			g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
+				      OCT_SG_ENTRY_SIZE);
+
+			/* 8 spare bytes leave room for the alignment shift. */
+			g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
+			if (!g->sg) {
+				kfree(g);
+				break;
+			}
+
+			/* The gather component should be aligned on 64-bit
+			 * boundary
+			 */
+			if (((unsigned long)g->sg) & 7) {
+				g->adjust = 8 - (((unsigned long)g->sg) & 7);
+				g->sg = (struct octeon_sg_entry *)
+					((unsigned long)g->sg + g->adjust);
+			}
+			list_add_tail(&g->list, &lio->glist[i]);
+		}
+
+		/* The inner loop stopped early: an allocation failed. */
+		if (j != lio->tx_qsize) {
+			delete_glists(lio);
+			/* Both arrays are gone now; make sure no stale
+			 * pointer survives for a later teardown to free.
+			 */
+			lio->glist = NULL;
+			lio->glist_lock = NULL;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/**
* \brief Print link information
* @param netdev network device
*/
@@ -681,6 +807,8 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
cleanup_link_status_change_wq(netdev);
+ delete_glists(lio);
+
free_netdev(netdev);
oct->props[ifidx].gmxport = -1;
@@ -1379,6 +1507,12 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
/* Copy MAC Address to OS network device structure */
ether_addr_copy(netdev->dev_addr, mac);
+ if (setup_glists(lio, num_iqueues)) {
+ dev_err(&octeon_dev->pci_dev->dev,
+ "Gather list allocation failed\n");
+ goto setup_nic_dev_fail;
+ }
+
if (netdev->features & NETIF_F_LRO)
liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE,
OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next V2 6/7] liquidio CN23XX: VF TX buffers
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
To: davem
Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>
Adds support for freeing VF xmit buffers.
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 182 +++++++++++++++++++++
1 file changed, 182 insertions(+)
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index cf80722..ce5cdcd 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -270,6 +270,19 @@ static void start_txq(struct net_device *netdev)
}
/**
+ * \brief Wake a queue
+ * @param netdev network device
+ * @param q which queue to wake
+ */
+static inline void wake_q(struct net_device *netdev, int q)
+{
+	/* Single-queue devices have only the one device-wide queue. */
+	if (!netif_is_multiqueue(netdev)) {
+		netif_wake_queue(netdev);
+		return;
+	}
+
+	netif_wake_subqueue(netdev, q);
+}
+
+/**
* \brief Stop a queue
* @param netdev network device
* @param q which queue to stop
@@ -920,6 +933,163 @@ static int octeon_pci_os_setup(struct octeon_device *oct)
return 0;
}
+/* Map a network buffer to the index of the tx queue it belongs to:
+ * always 0 on a single-queue device, otherwise the skb's queue mapping
+ * folded into the number of tx queues.
+ */
+static inline int skb_iq(struct lio *lio, struct sk_buff *skb)
+{
+	if (!netif_is_multiqueue(lio->netdev))
+		return 0;
+
+	return skb->queue_mapping % lio->linfo.num_txpciq;
+}
+
+/**
+ * \brief Check Tx queue state for a given network buffer
+ * @param lio per-network private data
+ * @param skb network buffer
+ * @returns 0 if the instruction queue used by @skb is full, 1 otherwise.
+ *
+ * When the instruction queue has room and the matching netdev (sub)queue
+ * is stopped, the queue is woken and the tx_restart counter is bumped.
+ */
+static inline int check_txq_state(struct lio *lio, struct sk_buff *skb)
+{
+	int subq, instr_q;
+
+	if (!netif_is_multiqueue(lio->netdev)) {
+		instr_q = lio->txq;
+		subq = instr_q;
+	} else {
+		subq = skb->queue_mapping;
+		instr_q = lio->linfo.txpciq[(subq %
+					     (lio->linfo.num_txpciq))].s.q_no;
+	}
+
+	if (octnet_iq_is_full(lio->oct_dev, instr_q))
+		return 0;
+
+	/* Queue is already running: nothing to restart. */
+	if (!__netif_subqueue_stopped(lio->netdev, subq))
+		return 1;
+
+	INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, instr_q, tx_restart, 1);
+	wake_q(lio->netdev, subq);
+
+	return 1;
+}
+
+/**
+ * \brief Unmap and free network buffer
+ * @param buf buffer; points at the octnet_buf_free_info of the packet
+ *
+ * Unmaps the skb data, re-checks the tx queue state and then releases
+ * the skb.
+ */
+static void free_netbuf(void *buf)
+{
+	struct octnet_buf_free_info *finfo = buf;
+	struct sk_buff *skb = finfo->skb;
+	struct lio *lio = finfo->lio;
+
+	dma_unmap_single(&lio->oct_dev->pci_dev->dev, finfo->dptr, skb->len,
+			 DMA_TO_DEVICE);
+
+	check_txq_state(lio, skb);
+
+	tx_buffer_free(skb);
+}
+
+/**
+ * \brief Unmap and free gather buffer
+ * @param buf buffer; points at the octnet_buf_free_info of the packet
+ *
+ * Unmaps the linear part and every page fragment of the skb as well as
+ * the gather list itself, hands the gather node back to the list of its
+ * queue, re-checks the tx queue state and releases the skb.
+ */
+static void free_netsgbuf(void *buf)
+{
+	struct octnet_buf_free_info *finfo = buf;
+	struct sk_buff *skb = finfo->skb;
+	struct lio *lio = finfo->lio;
+	struct octnic_gather *g = finfo->g;
+	int nfrags = skb_shinfo(skb)->nr_frags;
+	int slot, iq;
+
+	/* Slot 0 of the gather list holds the linear part of the skb. */
+	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
+			 g->sg[0].ptr[0], (skb->len - skb->data_len),
+			 DMA_TO_DEVICE);
+
+	/* Slot n (n >= 1) holds fragment n - 1; four slots per sg entry. */
+	for (slot = 1; slot <= nfrags; slot++) {
+		struct skb_frag_struct *frag =
+			&skb_shinfo(skb)->frags[slot - 1];
+
+		pci_unmap_page((lio->oct_dev)->pci_dev,
+			       g->sg[(slot >> 2)].ptr[(slot & 3)],
+			       frag->size, DMA_TO_DEVICE);
+	}
+
+	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
+			 finfo->dptr, g->sg_size,
+			 DMA_TO_DEVICE);
+
+	iq = skb_iq(lio, skb);
+
+	spin_lock(&lio->glist_lock[iq]);
+	list_add_tail(&g->list, &lio->glist[iq]);
+	spin_unlock(&lio->glist_lock[iq]);
+
+	check_txq_state(lio, skb); /* mq support: sub-queue state check */
+
+	tx_buffer_free(skb);
+}
+
+/**
+ * \brief Unmap and free gather buffer with response
+ * @param buf buffer; points at the soft command whose callback_arg is
+ *            the skb
+ *
+ * Same unmapping and gather-node recycling as free_netsgbuf(), but the
+ * skb itself is left alone.
+ */
+static void free_netsgbuf_with_resp(void *buf)
+{
+	struct octeon_soft_command *sc = buf;
+	struct sk_buff *skb = (struct sk_buff *)sc->callback_arg;
+	/* The free info lives in the control block of the skb. */
+	struct octnet_buf_free_info *finfo =
+		(struct octnet_buf_free_info *)&skb->cb;
+	struct lio *lio = finfo->lio;
+	struct octnic_gather *g = finfo->g;
+	int nfrags = skb_shinfo(skb)->nr_frags;
+	int slot, iq;
+
+	/* Slot 0 of the gather list holds the linear part of the skb. */
+	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
+			 g->sg[0].ptr[0], (skb->len - skb->data_len),
+			 DMA_TO_DEVICE);
+
+	/* Slot n (n >= 1) holds fragment n - 1; four slots per sg entry. */
+	for (slot = 1; slot <= nfrags; slot++) {
+		struct skb_frag_struct *frag =
+			&skb_shinfo(skb)->frags[slot - 1];
+
+		pci_unmap_page((lio->oct_dev)->pci_dev,
+			       g->sg[(slot >> 2)].ptr[(slot & 3)],
+			       frag->size, DMA_TO_DEVICE);
+	}
+
+	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
+			 finfo->dptr, g->sg_size,
+			 DMA_TO_DEVICE);
+
+	iq = skb_iq(lio, skb);
+
+	spin_lock(&lio->glist_lock[iq]);
+	list_add_tail(&g->list, &lio->glist[iq]);
+	spin_unlock(&lio->glist_lock[iq]);
+
+	/* Don't free the skb yet */
+
+	check_txq_state(lio, skb);
+}
+
/**
* \brief Callback for getting interface configuration
* @param status status of request
@@ -1675,6 +1845,18 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
octeon_register_dispatch_fn(octeon_dev, OPCODE_NIC, OPCODE_NIC_INFO,
lio_nic_info, octeon_dev);
+ /* REQTYPE_RESP_NET and REQTYPE_SOFT_COMMAND do not have free functions.
+ * They are handled directly.
+ */
+ octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_NORESP_NET,
+ free_netbuf);
+
+ octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_NORESP_NET_SG,
+ free_netsgbuf);
+
+ octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_RESP_NET_SG,
+ free_netsgbuf_with_resp);
+
for (i = 0; i < octeon_dev->ifcount; i++) {
resp_size = sizeof(struct liquidio_if_cfg_resp);
ctx_size = sizeof(struct liquidio_if_cfg_context);
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next V2 1/7] liquidio CN23XX: VF offload features
From: Raghu Vatsavayi @ 2016-12-06 21:06 UTC (permalink / raw)
To: davem
Cc: netdev, Raghu Vatsavayi, Raghu Vatsavayi, Derek Chickles,
Satanand Burla, Felix Manlunas
In-Reply-To: <1481058367-3937-1-git-send-email-rvatsavayi@caviumnetworks.com>
Adds support for VF link initialization and offload features.
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@caviumnetworks.com>
Signed-off-by: Derek Chickles <derek.chickles@caviumnetworks.com>
Signed-off-by: Satanand Burla <satananda.burla@caviumnetworks.com>
Signed-off-by: Felix Manlunas <felix.manlunas@caviumnetworks.com>
---
drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 562 +++++++++++++++++++++
.../net/ethernet/cavium/liquidio/octeon_device.c | 3 +
2 files changed, 565 insertions(+)
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index e6321f3..81a578f 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -22,7 +22,9 @@
#include "octeon_iq.h"
#include "response_manager.h"
#include "octeon_device.h"
+#include "octeon_nic.h"
#include "octeon_main.h"
+#include "octeon_network.h"
#include "cn23xx_vf_device.h"
MODULE_AUTHOR("Cavium Networks, <support@cavium.com>");
@@ -30,6 +32,33 @@
MODULE_LICENSE("GPL");
MODULE_VERSION(LIQUIDIO_VERSION);
+static int debug = -1;
+module_param(debug, int, 0644);
+MODULE_PARM_DESC(debug, "NETIF_MSG debug bits");
+
+#define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
+
+#define LIO_IFSTATE_REGISTERED 0x02
+#define LIO_IFSTATE_RUNNING 0x04
+
+/* Per-request context shared between setup_nic_devices(), which sends the
+ * interface configuration command, and if_cfg_callback(), which completes it.
+ */
+struct liquidio_if_cfg_context {
+ /* Id of the octeon device; the callback uses it to look the device up. */
+ int octeon_id;
+
+ /* Wait queue the requester sleeps on until the callback wakes it. */
+ wait_queue_head_t wc;
+
+ /* Completion flag: cleared before sending, set to 1 by the callback. */
+ int cond;
+};
+
+/* Layout of the response buffer of the interface configuration command. */
+struct liquidio_if_cfg_resp {
+ /* NOTE(review): presumably the response header - confirm. */
+ u64 rh;
+ /* Queue masks, link info and firmware version reported back. */
+ struct liquidio_if_cfg_info cfg_info;
+ /* Non-zero is treated as a failed request by the readers in this file. */
+ u64 status;
+};
+
+#define OCTNIC_GSO_MAX_HEADER_SIZE 128
+#define OCTNIC_GSO_MAX_SIZE \
+ (CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
+
struct octeon_device_priv {
/* Tasklet structures for this device. */
struct tasklet_struct droq_tasklet;
@@ -40,6 +69,7 @@ struct octeon_device_priv {
liquidio_vf_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
static void liquidio_vf_remove(struct pci_dev *pdev);
static int octeon_device_init(struct octeon_device *oct);
+static int liquidio_stop(struct net_device *netdev);
static int lio_wait_for_oq_pkts(struct octeon_device *oct)
{
@@ -113,6 +143,26 @@ static int wait_for_pending_requests(struct octeon_device *oct)
.remove = liquidio_vf_remove,
};
+/**
+ * \brief set interface state
+ * @param lio per-network private data
+ * @param state_flag flag state to set
+ *
+ * The bits are OR-ed in with one atomic operation. A separate
+ * atomic_read() followed by atomic_set() would be a non-atomic
+ * read-modify-write that can lose a concurrent update of another flag.
+ */
+static inline void ifstate_set(struct lio *lio, int state_flag)
+{
+	atomic_or(state_flag, &lio->ifstate);
+}
+
+/**
+ * \brief clear interface state
+ * @param lio per-network private data
+ * @param state_flag flag state to clear
+ *
+ * The bits are masked out with one atomic operation. A separate
+ * atomic_read() followed by atomic_set() would be a non-atomic
+ * read-modify-write that can lose a concurrent update of another flag.
+ */
+static inline void ifstate_reset(struct lio *lio, int state_flag)
+{
+	atomic_and(~state_flag, &lio->ifstate);
+}
+
static
int liquidio_schedule_msix_droq_pkt_handler(struct octeon_droq *droq, u64 ret)
{
@@ -316,6 +366,7 @@ static void octeon_destroy_resources(struct octeon_device *oct)
/* No more instructions will be forwarded. */
atomic_set(&oct->status, OCT_DEV_IN_RESET);
+ oct->app_mode = CVM_DRV_INVALID_APP;
dev_dbg(&oct->pci_dev->dev, "Device state is now %s\n",
lio_get_state_string(&oct->status));
@@ -420,6 +471,63 @@ static void octeon_destroy_resources(struct octeon_device *oct)
}
/**
+ * \brief Destroy NIC device interface
+ * @param oct octeon device
+ * @param ifidx which interface to destroy
+ *
+ * Cleanup associated with each interface for an Octeon device when NIC
+ * module is being unloaded or if initialization fails during load.
+ */
+static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
+{
+	struct net_device *ndev = oct->props[ifidx].netdev;
+	struct lio *priv;
+
+	if (!ndev) {
+		dev_err(&oct->pci_dev->dev, "%s No netdevice ptr for index %d\n",
+			__func__, ifidx);
+		return;
+	}
+
+	priv = GET_LIO(ndev);
+
+	dev_dbg(&oct->pci_dev->dev, "NIC device cleanup\n");
+
+	/* Tear down in reverse order of bring-up: stop, unregister, free. */
+	if (atomic_read(&priv->ifstate) & LIO_IFSTATE_RUNNING)
+		liquidio_stop(ndev);
+
+	if (atomic_read(&priv->ifstate) & LIO_IFSTATE_REGISTERED)
+		unregister_netdev(ndev);
+
+	free_netdev(ndev);
+
+	/* Mark the slot unused again. */
+	oct->props[ifidx].gmxport = -1;
+	oct->props[ifidx].netdev = NULL;
+}
+
+/**
+ * \brief Stop complete NIC functionality
+ * @param oct octeon device
+ * @returns 0 once every interface has been destroyed, 1 if the device
+ *          has no interfaces (ifcount is zero)
+ */
+static int liquidio_stop_nic_module(struct octeon_device *oct)
+{
+	int ifidx = 0;
+
+	dev_dbg(&oct->pci_dev->dev, "Stopping network interfaces\n");
+
+	if (oct->ifcount == 0) {
+		dev_err(&oct->pci_dev->dev, "Init for Octeon was not completed\n");
+		return 1;
+	}
+
+	while (ifidx < oct->ifcount) {
+		liquidio_destroy_nic_device(oct, ifidx);
+		ifidx++;
+	}
+
+	dev_dbg(&oct->pci_dev->dev, "Network interfaces stopped\n");
+
+	return 0;
+}
+
+/**
* \brief Cleans up resources at unload time
* @param pdev PCI device structure
*/
@@ -429,6 +537,9 @@ static void liquidio_vf_remove(struct pci_dev *pdev)
dev_dbg(&oct_dev->pci_dev->dev, "Stopping device\n");
+ if (oct_dev->app_mode == CVM_DRV_NIC_APP)
+ liquidio_stop_nic_module(oct_dev);
+
/* Reset the octeon device and cleanup all memory allocated for
* the octeon device by driver.
*/
@@ -472,6 +583,452 @@ static int octeon_pci_os_setup(struct octeon_device *oct)
}
/**
+ * \brief Callback for getting interface configuration
+ * @param status status of request
+ * @param buf pointer to resp structure
+ */
+static void if_cfg_callback(struct octeon_device *oct,
+			    u32 status __attribute__((unused)), void *buf)
+{
+	struct octeon_soft_command *sc = (struct octeon_soft_command *)buf;
+	struct liquidio_if_cfg_context *ctx;
+	struct liquidio_if_cfg_resp *resp;
+
+	resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
+	ctx = (struct liquidio_if_cfg_context *)sc->ctxptr;
+
+	/* The device is looked up from the id stored in the context; the
+	 * oct argument passed in is not used.
+	 */
+	oct = lio_get_device(ctx->octeon_id);
+	if (resp->status)
+		dev_err(&oct->pci_dev->dev, "nic if cfg instruction failed. Status: %llx\n",
+			CVM_CAST64(resp->status));
+
+	/* Finish every access to the response before the completion flag
+	 * is published: the waiter may free the soft command as soon as it
+	 * sees the flag set.
+	 */
+	snprintf(oct->fw_info.liquidio_firmware_version, 32, "%s",
+		 resp->cfg_info.liquidio_firmware_version);
+
+	/* This barrier is required to be sure that the response has been
+	 * written fully before waking up the handler
+	 */
+	wmb();
+
+	WRITE_ONCE(ctx->cond, 1);
+
+	wake_up_interruptible(&ctx->wc);
+}
+
+/**
+ * \brief Select queue based on hash
+ * @param dev Net device
+ * @param skb sk_buff structure
+ * @param accel_priv unused
+ * @param fallback unused
+ * @returns selected queue number: the tx hash of @skb folded into the
+ *          number of tx queues of the interface
+ */
+static u16 select_q(struct net_device *dev, struct sk_buff *skb,
+		    void *accel_priv __attribute__((unused)),
+		    select_queue_fallback_t fallback __attribute__((unused)))
+{
+	struct lio *lio = GET_LIO(dev);
+	u32 hashed_q = skb_tx_hash(dev, skb);
+
+	return (u16)(hashed_q % (lio->linfo.num_txpciq));
+}
+
+/**
+ * \brief Net device stop for LiquidIO
+ * @param netdev network device
+ * @returns always 0
+ *
+ * Marks the interface closed and the link down, reports carrier loss and
+ * clears the RUNNING state flag.
+ */
+static int liquidio_stop(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+
+	netif_info(lio, ifdown, lio->netdev, "Stopping interface!\n");
+
+	/* Inform that netif carrier is down */
+	lio->intf_open = 0;
+	lio->linfo.link.s.link_up = 0;
+	netif_carrier_off(netdev);
+	lio->link_changes++;
+
+	ifstate_reset(lio, LIO_IFSTATE_RUNNING);
+
+	dev_info(&lio->oct_dev->pci_dev->dev, "%s interface is stopped\n",
+		 netdev->name);
+
+	return 0;
+}
+
+/** Sending command to enable/disable RX checksum offload
+ * @param netdev pointer to network device
+ * @param command OCTNET_CMD_TNL_RX_CSUM_CTL
+ * @param rx_cmd OCTNET_CMD_RXCSUM_ENABLE/
+ *               OCTNET_CMD_RXCSUM_DISABLE
+ * @returns the value returned by octnet_send_nic_ctrl_pkt(); a negative
+ *          value is additionally logged as a failure
+ */
+static int liquidio_set_rxcsum_command(struct net_device *netdev, int command,
+				       u8 rx_cmd)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+	int ret = 0;
+
+	/* Clear the whole control packet first so that members which are
+	 * not assigned below do not carry stack garbage into the send path.
+	 */
+	memset(&nctrl, 0, sizeof(nctrl));
+
+	nctrl.ncmd.s.cmd = command;
+	nctrl.ncmd.s.param1 = rx_cmd;
+	/* Control commands go out on the first tx queue of the interface. */
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.wait_time = 100;
+	nctrl.netpndev = (u64)netdev;
+	nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+
+	ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl);
+	if (ret < 0) {
+		dev_err(&oct->pci_dev->dev, "DEVFLAGS RXCSUM change failed in core (ret:0x%x)\n",
+			ret);
+	}
+	return ret;
+}
+
+/** \brief Net device fix features
+ * @param netdev pointer to network device
+ * @param request features requested
+ * @returns updated features list: @request without the offloads the
+ *          device is not capable of, and without LRO when RX checksumming
+ *          ends up disabled
+ */
+static netdev_features_t liquidio_fix_features(struct net_device *netdev,
+					       netdev_features_t request)
+{
+	struct lio *lio = netdev_priv(netdev);
+	/* Offloads that may only be requested when the device has them. */
+	const netdev_features_t gated = NETIF_F_RXCSUM | NETIF_F_HW_CSUM |
+					NETIF_F_TSO | NETIF_F_TSO6 |
+					NETIF_F_LRO;
+
+	/* Drop every gated offload that is missing from the capabilities. */
+	request &= ~(gated & ~lio->dev_capability);
+
+	/* Disable LRO if RXCSUM is off */
+	if (!(request & NETIF_F_RXCSUM) &&
+	    (netdev->features & NETIF_F_LRO) &&
+	    (lio->dev_capability & NETIF_F_LRO))
+		request &= ~NETIF_F_LRO;
+
+	return request;
+}
+
+/** \brief Net device set features
+ * @param netdev pointer to network device
+ * @param features features to enable/disable
+ * @returns always 0; results of the commands sent are not propagated
+ *
+ * Sends the LRO enable/disable command when the LRO bit changes and the
+ * RX checksum enable/disable command when the RXCSUM bit changes.
+ */
+static int liquidio_set_features(struct net_device *netdev,
+ netdev_features_t features)
+{
+ struct lio *lio = netdev_priv(netdev);
+
+ /* Nothing to do unless the LRO bit differs from the current setting.
+ * NOTE(review): this early return also skips the RXCSUM handling
+ * below, so an RXCSUM-only change sends no command - confirm this is
+ * intended.
+ */
+ if (!((netdev->features ^ features) & NETIF_F_LRO))
+ return 0;
+
+ /* LRO is only toggled when the device is capable of it. */
+ if ((features & NETIF_F_LRO) && (lio->dev_capability & NETIF_F_LRO))
+ liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE,
+ OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
+ else if (!(features & NETIF_F_LRO) &&
+ (lio->dev_capability & NETIF_F_LRO))
+ liquidio_set_feature(netdev, OCTNET_CMD_LRO_DISABLE,
+ OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
+ /* RXCSUM: act only on an off->on or on->off transition, and only if
+ * enc_dev_capability advertises RXCSUM.
+ */
+ if (!(netdev->features & NETIF_F_RXCSUM) &&
+ (lio->enc_dev_capability & NETIF_F_RXCSUM) &&
+ (features & NETIF_F_RXCSUM))
+ liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL,
+ OCTNET_CMD_RXCSUM_ENABLE);
+ else if ((netdev->features & NETIF_F_RXCSUM) &&
+ (lio->enc_dev_capability & NETIF_F_RXCSUM) &&
+ !(features & NETIF_F_RXCSUM))
+ liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL,
+ OCTNET_CMD_RXCSUM_DISABLE);
+
+ return 0;
+}
+
+/* Net device operations installed on every netdev created by
+ * setup_nic_devices(): feature negotiation and tx queue selection.
+ */
+static const struct net_device_ops lionetdevops = {
+ .ndo_fix_features = liquidio_fix_features,
+ .ndo_set_features = liquidio_set_features,
+ .ndo_select_queue = select_q,
+};
+
+/**
+ * \brief Setup network interfaces
+ * @param octeon_dev octeon device
+ * @returns 0 on success, -ENODEV on any failure
+ *
+ * Called during init time for each device. It assumes the NIC
+ * is already up and running. For every interface an interface
+ * configuration command is sent to the firmware; the queue masks, link
+ * information and MAC address in the response are used to create and
+ * register the net device. On failure every interface that was already
+ * created, including a partially set up one, is destroyed again.
+ */
+static int setup_nic_devices(struct octeon_device *octeon_dev)
+{
+	int retval, num_iqueues, num_oqueues;
+	struct liquidio_if_cfg_context *ctx;
+	u32 resp_size, ctx_size, data_size;
+	struct liquidio_if_cfg_resp *resp;
+	struct octeon_soft_command *sc;
+	union oct_nic_if_cfg if_cfg;
+	struct octdev_props *props;
+	struct net_device *netdev;
+	struct lio_version *vdata;
+	struct lio *lio = NULL;
+	u8 mac[ETH_ALEN], i, j;
+	u32 ifidx_or_pfnum;
+
+	ifidx_or_pfnum = octeon_dev->pf_num;
+
+	for (i = 0; i < octeon_dev->ifcount; i++) {
+		resp_size = sizeof(struct liquidio_if_cfg_resp);
+		ctx_size = sizeof(struct liquidio_if_cfg_context);
+		data_size = sizeof(struct lio_version);
+		sc = (struct octeon_soft_command *)
+			octeon_alloc_soft_command(octeon_dev, data_size,
+						  resp_size, ctx_size);
+		if (!sc) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"Soft command allocation failed\n");
+			/* There is no soft command to free for this
+			 * iteration, so skip straight to the unwinding.
+			 */
+			goto setup_nic_wait_intr;
+		}
+		resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
+		ctx = (struct liquidio_if_cfg_context *)sc->ctxptr;
+		vdata = (struct lio_version *)sc->virtdptr;
+
+		/* The driver base version is sent big-endian. */
+		*((u64 *)vdata) = 0;
+		vdata->major = cpu_to_be16(LIQUIDIO_BASE_MAJOR_VERSION);
+		vdata->minor = cpu_to_be16(LIQUIDIO_BASE_MINOR_VERSION);
+		vdata->micro = cpu_to_be16(LIQUIDIO_BASE_MICRO_VERSION);
+
+		WRITE_ONCE(ctx->cond, 0);
+		ctx->octeon_id = lio_get_device_id(octeon_dev);
+		init_waitqueue_head(&ctx->wc);
+
+		if_cfg.u64 = 0;
+
+		if_cfg.s.num_iqueues = octeon_dev->sriov_info.rings_per_vf;
+		if_cfg.s.num_oqueues = octeon_dev->sriov_info.rings_per_vf;
+		if_cfg.s.base_queue = 0;
+
+		sc->iq_no = 0;
+
+		octeon_prepare_soft_command(octeon_dev, sc, OPCODE_NIC,
+					    OPCODE_NIC_IF_CFG, 0, if_cfg.u64,
+					    0);
+
+		sc->callback = if_cfg_callback;
+		sc->callback_arg = sc;
+		sc->wait_time = 5000;
+
+		retval = octeon_send_soft_command(octeon_dev, sc);
+		if (retval == IQ_SEND_FAILED) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"iq/oq config failed status: %x\n", retval);
+			/* Soft instr is freed by driver in case of failure. */
+			goto setup_nic_dev_fail;
+		}
+
+		/* Sleep on a wait queue till the cond flag indicates that the
+		 * response arrived or timed-out.
+		 */
+		if (sleep_cond(&ctx->wc, &ctx->cond) == -EINTR) {
+			dev_err(&octeon_dev->pci_dev->dev, "Wait interrupted\n");
+			/* NOTE(review): the soft command is deliberately not
+			 * freed on this path, presumably because its callback
+			 * may still run - confirm.
+			 */
+			goto setup_nic_wait_intr;
+		}
+
+		retval = resp->status;
+		if (retval) {
+			dev_err(&octeon_dev->pci_dev->dev, "iq/oq config failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		octeon_swap_8B_data((u64 *)(&resp->cfg_info),
+				    (sizeof(struct liquidio_if_cfg_info)) >> 3);
+
+		/* One queue per bit set in the masks. */
+		num_iqueues = hweight64(resp->cfg_info.iqmask);
+		num_oqueues = hweight64(resp->cfg_info.oqmask);
+
+		if (!(num_iqueues) || !(num_oqueues)) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"Got bad iqueues (%016llx) or oqueues (%016llx) from firmware.\n",
+				resp->cfg_info.iqmask, resp->cfg_info.oqmask);
+			goto setup_nic_dev_fail;
+		}
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"interface %d, iqmask %016llx, oqmask %016llx, numiqueues %d, numoqueues %d\n",
+			i, resp->cfg_info.iqmask, resp->cfg_info.oqmask,
+			num_iqueues, num_oqueues);
+
+		netdev = alloc_etherdev_mq(LIO_SIZE, num_iqueues);
+
+		if (!netdev) {
+			dev_err(&octeon_dev->pci_dev->dev, "Device allocation failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		SET_NETDEV_DEV(netdev, &octeon_dev->pci_dev->dev);
+
+		/* Associate the routines that will handle different
+		 * netdev tasks.
+		 */
+		netdev->netdev_ops = &lionetdevops;
+
+		lio = GET_LIO(netdev);
+
+		memset(lio, 0, sizeof(struct lio));
+
+		lio->ifidx = ifidx_or_pfnum;
+
+		/* From here on the netdev is reachable through props[i], which
+		 * is what the failure path uses to release it.
+		 */
+		props = &octeon_dev->props[i];
+		props->gmxport = resp->cfg_info.linfo.gmxport;
+		props->netdev = netdev;
+
+		lio->linfo.num_rxpciq = num_oqueues;
+		lio->linfo.num_txpciq = num_iqueues;
+
+		for (j = 0; j < num_oqueues; j++) {
+			lio->linfo.rxpciq[j].u64 =
+			    resp->cfg_info.linfo.rxpciq[j].u64;
+		}
+		for (j = 0; j < num_iqueues; j++) {
+			lio->linfo.txpciq[j].u64 =
+			    resp->cfg_info.linfo.txpciq[j].u64;
+		}
+
+		lio->linfo.hw_addr = resp->cfg_info.linfo.hw_addr;
+		lio->linfo.gmxport = resp->cfg_info.linfo.gmxport;
+		lio->linfo.link.u64 = resp->cfg_info.linfo.link.u64;
+		lio->linfo.macaddr_is_admin_asgnd =
+			resp->cfg_info.linfo.macaddr_is_admin_asgnd;
+
+		lio->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
+
+		lio->dev_capability = NETIF_F_HIGHDMA
+				      | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM
+				      | NETIF_F_SG | NETIF_F_RXCSUM
+				      | NETIF_F_TSO | NETIF_F_TSO6
+				      | NETIF_F_GRO
+				      | NETIF_F_LRO;
+		netif_set_gso_max_size(netdev, OCTNIC_GSO_MAX_SIZE);
+
+		/* LRO is supported but left disabled by default. */
+		netdev->features = (lio->dev_capability & ~NETIF_F_LRO);
+
+		netdev->hw_features = lio->dev_capability;
+
+		/* Point to the properties for octeon device to which this
+		 * interface belongs.
+		 */
+		lio->oct_dev = octeon_dev;
+		lio->octprops = props;
+		lio->netdev = netdev;
+
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"if%d gmx: %d hw_addr: 0x%llx\n", i,
+			lio->linfo.gmxport, CVM_CAST64(lio->linfo.hw_addr));
+
+		/* 64-bit swap required on LE machines */
+		octeon_swap_8B_data(&lio->linfo.hw_addr, 1);
+		/* The MAC is taken from the 6 bytes after the first 2 bytes
+		 * of hw_addr.
+		 */
+		for (j = 0; j < ETH_ALEN; j++)
+			mac[j] = *((u8 *)(((u8 *)&lio->linfo.hw_addr) + 2 + j));
+
+		/* Copy MAC Address to OS network device structure */
+		ether_addr_copy(netdev->dev_addr, mac);
+
+		if (netdev->features & NETIF_F_LRO)
+			liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE,
+					     OCTNIC_LROIPV4 | OCTNIC_LROIPV6);
+
+		if ((debug != -1) && (debug & NETIF_MSG_HW))
+			liquidio_set_feature(netdev, OCTNET_CMD_VERBOSE_ENABLE,
+					     0);
+
+		/* Register the network device with the OS */
+		if (register_netdev(netdev)) {
+			dev_err(&octeon_dev->pci_dev->dev, "Device registration failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"Setup NIC ifidx:%d mac:%02x%02x%02x%02x%02x%02x\n",
+			i, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+		netif_carrier_off(netdev);
+		lio->link_changes++;
+
+		ifstate_set(lio, LIO_IFSTATE_REGISTERED);
+
+		/* Sending command to firmware to enable Rx checksum offload
+		 * by default at the time of setup of Liquidio driver for
+		 * this device
+		 */
+		liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL,
+					    OCTNET_CMD_RXCSUM_ENABLE);
+		liquidio_set_feature(netdev, OCTNET_CMD_TNL_TX_CSUM_CTL,
+				     OCTNET_CMD_TXCSUM_ENABLE);
+
+		dev_dbg(&octeon_dev->pci_dev->dev,
+			"NIC ifidx:%d Setup successful\n", i);
+
+		octeon_free_soft_command(octeon_dev, sc);
+	}
+
+	return 0;
+
+setup_nic_dev_fail:
+
+	octeon_free_soft_command(octeon_dev, sc);
+
+setup_nic_wait_intr:
+
+	/* The loop below only unwinds the interfaces before index i. If the
+	 * failing iteration had already attached a netdev to its slot,
+	 * release that one as well so it is not leaked.
+	 */
+	if (octeon_dev->props[i].netdev)
+		liquidio_destroy_nic_device(octeon_dev, i);
+
+	while (i--) {
+		dev_err(&octeon_dev->pci_dev->dev,
+			"NIC ifidx:%d Setup failed\n", i);
+		liquidio_destroy_nic_device(octeon_dev, i);
+	}
+	return -ENODEV;
+}
+
+/**
+ * \brief initialize the NIC
+ * @param oct octeon device
+ * @returns 0 on success, otherwise the error returned by
+ *          setup_nic_devices()
+ *
+ * This initialization routine is called once the Octeon device application is
+ * up and running
+ */
+static int liquidio_init_nic_module(struct octeon_device *oct)
+{
+	int num_nic_ports = 1;
+	int i, retval = 0;
+
+	dev_dbg(&oct->pci_dev->dev, "Initializing network interfaces\n");
+
+	/* Only the default iq and oq were initialized so far. Initialize
+	 * the rest as well and run the port_config command for each port.
+	 */
+	oct->ifcount = num_nic_ports;
+	memset(oct->props, 0,
+	       sizeof(struct octdev_props) * num_nic_ports);
+
+	/* -1 marks a link slot as unused. */
+	for (i = 0; i < MAX_OCTEON_LINKS; i++)
+		oct->props[i].gmxport = -1;
+
+	retval = setup_nic_devices(oct);
+	if (retval) {
+		dev_err(&oct->pci_dev->dev, "Setup NIC devices failed\n");
+		/* Reset the interface count on failure only. On success it
+		 * must keep its value, because liquidio_stop_nic_module()
+		 * refuses to tear anything down when ifcount is zero.
+		 */
+		oct->ifcount = 0;
+	}
+
+	return retval;
+}
+
+/**
* \brief Device initialization for each Octeon device that is probed
* @param octeon_dev octeon device
*/
@@ -498,6 +1055,8 @@ static int octeon_device_init(struct octeon_device *oct)
atomic_set(&oct->status, OCT_DEV_PCI_MAP_DONE);
+ oct->app_mode = CVM_DRV_NIC_APP;
+
/* Initialize the dispatch mechanism used to push packets arriving on
* Octeon Output queues.
*/
@@ -594,6 +1153,9 @@ static int octeon_device_init(struct octeon_device *oct)
atomic_set(&oct->status, OCT_DEV_RUNNING);
+ if (liquidio_init_nic_module(oct))
+ return 1;
+
return 0;
}
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
index 6d54032..583818e 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -1221,6 +1221,9 @@ struct octeon_config *octeon_get_conf(struct octeon_device *oct)
} else if (OCTEON_CN23XX_PF(oct)) {
default_oct_conf = (struct octeon_config *)
(CHIP_CONF(oct, cn23xx_pf));
+ } else if (OCTEON_CN23XX_VF(oct)) {
+ default_oct_conf = (struct octeon_config *)
+ (CHIP_CONF(oct, cn23xx_vf));
}
return default_oct_conf;
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH nf-next v2] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-06 21:25 UTC (permalink / raw)
To: netfilter-devel; +Cc: fw, eric.dumazet, pablo, netdev, daniel, Willem de Bruijn
From: Willem de Bruijn <willemb@google.com>
Add support for attaching an eBPF object by file descriptor.
The iptables binary can be called with a path to an elf object or a
pinned bpf object. Also pass the mode and path to the kernel to be
able to return it later for iptables dump and save.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
Changes
v1 -> v2
- define XT_BPF_PATH_MAX (== 512: does not grow structure size)
---
include/uapi/linux/netfilter/xt_bpf.h | 21 ++++++++
net/netfilter/xt_bpf.c | 96 +++++++++++++++++++++++++++++------
2 files changed, 101 insertions(+), 16 deletions(-)
diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index 1fad2c2..b97725a 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -2,9 +2,11 @@
#define _XT_BPF_H
#include <linux/filter.h>
+#include <linux/limits.h>
#include <linux/types.h>
#define XT_BPF_MAX_NUM_INSTR 64
+#define XT_BPF_PATH_MAX (XT_BPF_MAX_NUM_INSTR * sizeof(struct sock_filter))
struct bpf_prog;
@@ -16,4 +18,23 @@ struct xt_bpf_info {
struct bpf_prog *filter __attribute__((aligned(8)));
};
+/* How the program of an xt_bpf_info_v1 match is supplied. */
+enum xt_bpf_modes {
+ /* Classic BPF instructions carried in bpf_program[]. */
+ XT_BPF_MODE_BYTECODE,
+ /* Program referenced by fd; userspace was given a pinned bpf object. */
+ XT_BPF_MODE_FD_PINNED,
+ /* Program referenced by fd; userspace was given an elf object. */
+ XT_BPF_MODE_FD_ELF,
+};
+
+/* Revision 1 match data: either inline bytecode or a program given by fd. */
+struct xt_bpf_info_v1 {
+ /* One of enum xt_bpf_modes. */
+ __u16 mode;
+ /* Number of instructions in bpf_program (bytecode mode). */
+ __u16 bpf_program_num_elem;
+ /* File descriptor of the program (fd modes). */
+ __s32 fd;
+ /* path has the same size as bpf_program, so the union does not grow
+ * the structure. The path is kept so that it can be returned later
+ * for iptables dump and save.
+ */
+ union {
+ struct sock_filter bpf_program[XT_BPF_MAX_NUM_INSTR];
+ char path[XT_BPF_PATH_MAX];
+ };
+
+ /* only used in the kernel */
+ struct bpf_prog *filter __attribute__((aligned(8)));
+};
+
#endif /*_XT_BPF_H */
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index dffee9d47..2dedaa2 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
+#include <linux/bpf.h>
#include <linux/netfilter/xt_bpf.h>
#include <linux/netfilter/x_tables.h>
@@ -20,15 +21,15 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_bpf");
MODULE_ALIAS("ip6t_bpf");
-static int bpf_mt_check(const struct xt_mtchk_param *par)
+static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
+ struct bpf_prog **ret)
{
- struct xt_bpf_info *info = par->matchinfo;
struct sock_fprog_kern program;
- program.len = info->bpf_program_num_elem;
- program.filter = info->bpf_program;
+ program.len = len;
+ program.filter = insns;
- if (bpf_prog_create(&info->filter, &program)) {
+ if (bpf_prog_create(ret, &program)) {
pr_info("bpf: check failed: parse error\n");
return -EINVAL;
}
@@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
return 0;
}
+static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
+{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ *ret = prog;
+ return 0;
+}
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info *info = par->matchinfo;
+
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+}
+
+static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ if (info->mode == XT_BPF_MODE_BYTECODE)
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+ else if (info->mode == XT_BPF_MODE_FD_PINNED ||
+ info->mode == XT_BPF_MODE_FD_ELF)
+ return __bpf_mt_check_fd(info->fd, &info->filter);
+ else
+ return -EINVAL;
+}
+
static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
@@ -43,31 +80,58 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
return BPF_PROG_RUN(info->filter, skb);
}
+static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb);
+}
+
static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
+
+ bpf_prog_destroy(info->filter);
+}
+
+static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
bpf_prog_destroy(info->filter);
}
-static struct xt_match bpf_mt_reg __read_mostly = {
- .name = "bpf",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = bpf_mt_check,
- .match = bpf_mt,
- .destroy = bpf_mt_destroy,
- .matchsize = sizeof(struct xt_bpf_info),
- .me = THIS_MODULE,
+static struct xt_match bpf_mt_reg[] __read_mostly = {
+ {
+ .name = "bpf",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check,
+ .match = bpf_mt,
+ .destroy = bpf_mt_destroy,
+ .matchsize = sizeof(struct xt_bpf_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "bpf",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check_v1,
+ .match = bpf_mt_v1,
+ .destroy = bpf_mt_destroy_v1,
+ .matchsize = sizeof(struct xt_bpf_info_v1),
+ .me = THIS_MODULE,
+ },
};
static int __init bpf_mt_init(void)
{
- return xt_register_match(&bpf_mt_reg);
+ return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
static void __exit bpf_mt_exit(void)
{
- xt_unregister_match(&bpf_mt_reg);
+ xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
module_init(bpf_mt_init);
--
2.8.0.rc3.226.g39d4020
^ permalink raw reply related
* Re: [PATCH v2 net-next 3/4] mlx4: xdp: Reserve headroom for receiving packet when XDP prog is active
From: Saeed Mahameed @ 2016-12-06 21:40 UTC (permalink / raw)
To: Martin KaFai Lau
Cc: Linux Netdev List, Alexei Starovoitov, Brenden Blanco,
Daniel Borkmann, David Miller, Jesper Dangaard Brouer,
Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <20161206182718.GA16682@kafai-mba.local>
On Tue, Dec 6, 2016 at 8:27 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> On Tue, Dec 06, 2016 at 06:50:47PM +0200, Saeed Mahameed wrote:
>> On Mon, Dec 5, 2016 at 9:55 PM, Martin KaFai Lau <kafai@fb.com> wrote:
>> > On Mon, Dec 05, 2016 at 02:54:06AM +0200, Saeed Mahameed wrote:
>> >> On Sun, Dec 4, 2016 at 5:17 AM, Martin KaFai Lau <kafai@fb.com> wrote:
>> >> > Reserve XDP_PACKET_HEADROOM and honor bpf_xdp_adjust_head()
>> >> > when XDP prog is active. This patch only affects the code
>> >> > path when XDP is active.
>> >> >
>> >> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
>> >> > ---
>> >>
>> >> Hi Martin, Sorry for the late review, i have some comments below
>> >>
>> >> > drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 +++++++++++++++--
>> >> > drivers/net/ethernet/mellanox/mlx4/en_rx.c | 23 +++++++++++++++++------
>> >> > drivers/net/ethernet/mellanox/mlx4/en_tx.c | 9 +++++----
>> >> > drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++-
>> >> > 4 files changed, 39 insertions(+), 13 deletions(-)
>> >> >
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> >> > index 311c14153b8b..094a13b52cf6 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
>> >> > @@ -51,7 +51,8 @@
>> >> > #include "mlx4_en.h"
>> >> > #include "en_port.h"
>> >> >
>> >> > -#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
>> >> > +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
>> >> > + XDP_PACKET_HEADROOM))
>> >> >
>> >> > int mlx4_en_setup_tc(struct net_device *dev, u8 up)
>> >> > {
>> >> > @@ -1551,6 +1552,7 @@ int mlx4_en_start_port(struct net_device *dev)
>> >> > struct mlx4_en_tx_ring *tx_ring;
>> >> > int rx_index = 0;
>> >> > int err = 0;
>> >> > + int mtu;
>> >> > int i, t;
>> >> > int j;
>> >> > u8 mc_list[16] = {0};
>> >> > @@ -1684,8 +1686,12 @@ int mlx4_en_start_port(struct net_device *dev)
>> >> > }
>> >> >
>> >> > /* Configure port */
>> >> > + mtu = priv->rx_skb_size + ETH_FCS_LEN;
>> >> > + if (priv->tx_ring_num[TX_XDP])
>> >> > + mtu += XDP_PACKET_HEADROOM;
>> >> > +
>> >>
>> >> Why would the physical MTU care for the headroom you preserve for XDP prog?
>> >> This is the wire MTU, it shouldn't be changed, please keep it as
>> >> before, any preservation you make in packets buffers are needed only
>> >> for FWD case or modify case (HW or wire should not care about them).
>> >
>> > Thanks for your feedback!
>>
>> Just doing my job :))
>>
>> >
>> > FWD:
>> > packet received from a port
>> > => process by a XDP prog
>> > => XDP_TX out to the same port.
>> >
>> > For example, if the received packet has 1500 payload and the XDP prog
>> > encapsulates it in an IPv6 header (+40 bytes). After testing, it cannot
>> > be sent out due to the HW/wire MTU is 1500.
>> >
>> > Even the wire MTU info was passed to the XDP prog, there is not much a
>> > XDP prog could do here other than dropping it.
>> >
>> > Hence, this patch gives guarantee to the XDP prog such that
>> > it can always send out what it has received + XDP_PACKET_HEADROOM.
>> >
>>
>> Still i am not convinced ! this is against common sense,
>> this means that the XDP prog can send packets larger than the MTU
>> seen on netdev!
>>
>> anyway if a packet with the size (MTU + XDP_PACKET_HEADROOM) was sent
>> from XDP ring and HW allowed it to exit somehow (with the code you
>> provided :)), most likely it will be dropped
>> at the other end.
> The MTU of our receiver side is larger than 1500.
>
> If the otherside could not handle >1500, we could lower the box running
> XDP prog to 1460.
>
This is exactly the user confusion we are trying to avoid.
Genuinely lowering the MTU on the other side, or dropping packets in the
XDP program that are not eligible for edit & FWD (packets > MTU - required
headroom), will have the same effect. Why don't you use this approach?
Dropping "large" packets in XDP seems to be the best solution.
> Just ensure we are on the same page. The rx MTU stays the same (1500)
> because the rx_desc's byte_count is not raised by XDP_PACKET_HEADROOM.
>
Yes, that is clear.
One more reason not to do this: packets that were previously dropped by
the HW for the "large MTU" drop cause will now pass the HW check but fail
with an RX error (the RX buffers are smaller than a wire-MTU-sized packet).
This counts as an error in both mlx5 and mlx4, which is not acceptable.
>>
>> I still think XDP prog should not be allowed to FW packets larger than
>> the MTU seen on the netdev and you shouldn't modify the wire MTU just
>> for this case.
>>
>> >>
>> >> > err = mlx4_SET_PORT_general(mdev->dev, priv->port,
>> >> > - priv->rx_skb_size + ETH_FCS_LEN,
>> >> > + mtu,
>> >> > priv->prof->tx_pause,
>> >> > priv->prof->tx_ppp,
>> >> > priv->prof->rx_pause,
>> >> > @@ -2255,6 +2261,13 @@ static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
>> >> > {
>> >> > struct mlx4_en_priv *priv = netdev_priv(dev);
>> >> >
>> >> > + if (mtu + XDP_PACKET_HEADROOM > priv->max_mtu) {
>> >> > + en_err(priv,
>> >> > + "Device max mtu:%d does not allow %d bytes reserved headroom for XDP prog\n",
>> >> > + priv->max_mtu, XDP_PACKET_HEADROOM);
>> >> > + return false;
>> >> > + }
>> >> > +
>> >> > if (mtu > MLX4_EN_MAX_XDP_MTU) {
>> >> > en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
>> >> > mtu, MLX4_EN_MAX_XDP_MTU);
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> >> > index 23e9d04d1ef4..324771ac929e 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> >> > @@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>> >> > struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
>> >> > const struct mlx4_en_frag_info *frag_info;
>> >> > struct page *page;
>> >> > - dma_addr_t dma;
>> >> > int i;
>> >> >
>> >> > for (i = 0; i < priv->num_frags; i++) {
>> >> > @@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>> >> >
>> >> > for (i = 0; i < priv->num_frags; i++) {
>> >> > frags[i] = ring_alloc[i];
>> >> > - dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
>> >> > + frags[i].page_offset += priv->frag_info[i].rx_headroom;
>> >>
>> >> I don't see any need for headroom on frag_info other than frag0 (which
>> >> is where the packet starts).
>> >> What is the meaning of a headroom of a frag in a middle of a packet ?
>> >>
>> >> if you agree with me then, you can use XDP_PACKET_HEADROOM as is where
>> >> needed (i.e frag0 page offset) and remove
>> >> "priv->frag_info[i].rx_headroom"
>> >>
>> >> ...
>> >>
>> >> After going through the code a little bit i see that this code is
>> >> shared between XDP and common path, and you didn't want to add boolean
>> >> conditions.
>> >>
>> >> Ok i see what you did here.
>> >>
>> >> Maybe we can pass headroom as a function parameter and split frag0
>> >> handling from the rest ?
>> >> If it is too much then i am ok with the code as it is,
>> > Right, this patch does the boolean check (XDP active or not) early on
>> > in mlx4_en_calc_rx_buf() (i.e. out of the fast path) and store
>> > the result in priv->frag_info[0].rx_headroom.
>> >
>> > Just want to ensure I understand your comment correctly.
>> > You prefer not to store the boolean test result in frag_info[0].rx_headroom
>> > since it is redundant to !!priv->tx_ring_num[TX_XDP] and rx_headroom is also
>> > confusing for frag[1-3].
>> >
>> > Instead, do the XDP [in]active test before calling mlx4_en_alloc_frags()
>> > and then only adjust frags[0].page_offset by +XDP_PACKET_HEADROOM if is needed.
>> > It could be done either by passing an extra argument to mlx4_en_alloc_frags()
>> > or completely separate mlx4_en_alloc_frags(). I am fine with this also.
>> >
>>
>> Correct, but if this change will add extra checks to the data path
>> then I am ok with the current code.
> Right, the check has to be done somewhere in the data path.
> Lets stay with the current approach then.
>
>>
>> >
>> >>
>> >> > + rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
>> >> > + frags[i].page_offset);
>> >> > ring_alloc[i] = page_alloc[i];
>> >> > - rx_desc->data[i].addr = cpu_to_be64(dma);
>> >> > }
>> >> >
>> >> > return 0;
>> >> > @@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
>> >> >
>> >> > if (ring->page_cache.index > 0) {
>> >> > frags[0] = ring->page_cache.buf[--ring->page_cache.index];
>> >> > - rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
>> >> > + rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
>> >> > + frags[0].page_offset);
>> >> > return 0;
>> >> > }
>> >> >
>> >> > @@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>> >> > if (xdp_prog) {
>> >> > struct xdp_buff xdp;
>> >> > dma_addr_t dma;
>> >> > + void *pg_addr, *orig_data;
>> >> > u32 act;
>> >> >
>> >> > dma = be64_to_cpu(rx_desc->data[0].addr);
>> >> > @@ -896,11 +898,18 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>> >> > priv->frag_info[0].frag_size,
>> >> > DMA_FROM_DEVICE);
>> >> >
>> >> > - xdp.data = page_address(frags[0].page) +
>> >> > - frags[0].page_offset;
>> >> > + pg_addr = page_address(frags[0].page);
>> >> > + orig_data = pg_addr + frags[0].page_offset;
>> >> > + xdp.data = orig_data;
>> >> > xdp.data_end = xdp.data + length;
>> >> >
>> >> > act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> >> > +
>> >> > + if (xdp.data != orig_data) {
>> >> > + length = xdp.data_end - xdp.data;
>> >> > + frags[0].page_offset = xdp.data - pg_addr;
>> >> > + }
>> >> > +
>> >> >
>> >>
>> >> is this needed only for XDP FWD case ?
>> > No. It is also for PASS.
>> >
>>
>> I see.
>>
>> >> is this the only way to detect that the user modified the packet
>> >> headers (comparing pointers, before and after) ?
>> > Yes
>> >
>> >>
>> >> if the answer is yes, it should be faster to unconditionally reset
>> >> packet offset and length on XDP_FWD :
>> >> case XDP_FWD:
>> >> length = xdp.data_end - xdp.data;
>> >> frags[0].page_offset = xdp.data - pg_addr;
>> >>
>> >>
>> >> > switch (act) {
>> >> > case XDP_PASS:
>> >> > break;
>> >> > @@ -1180,6 +1189,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
>> >> > */
>> >> > priv->frag_info[0].frag_stride = PAGE_SIZE;
>> >> > priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
>> >> > + priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
>> >> > i = 1;
>> >> > } else {
>> >> > int buf_size = 0;
>> >> > @@ -1194,6 +1204,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
>> >> > ALIGN(priv->frag_info[i].frag_size,
>> >> > SMP_CACHE_BYTES);
>> >> > priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
>> >> > + priv->frag_info[i].rx_headroom = 0;
>> >>
>> >> IMHO, redundant. as you see here frag0 and other frags handling are
>> >> separated, maybe we can do the same in mlx4_en_alloc_frags.
>> >>
>> >> > buf_size += priv->frag_info[i].frag_size;
>> >> > i++;
>> >> > }
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> >> > index 4b597dca5c52..9e5f38cefe5f 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> >> > @@ -354,7 +354,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
>> >> > struct mlx4_en_rx_alloc frame = {
>> >> > .page = tx_info->page,
>> >> > .dma = tx_info->map0_dma,
>> >> > - .page_offset = 0,
>> >> > + .page_offset = XDP_PACKET_HEADROOM,
>> >> > .page_size = PAGE_SIZE,
>> >> > };
>> >> >
>> >> > @@ -1132,7 +1132,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
>> >> > tx_info->page = frame->page;
>> >> > frame->page = NULL;
>> >> > tx_info->map0_dma = dma;
>> >> > - tx_info->map0_byte_count = length;
>> >> > + tx_info->map0_byte_count = length + frame->page_offset;
>> >>
>> >> Didn't you already take care of length by the following code:
>> >> if (xdp.data != orig_data) {
>> >> length = xdp.data_end - xdp.data;
>> >> frags[0].page_offset = xdp.data - pg_addr;
>> >> }
>> >>
>> > Before this patch, length always assumes the data starts at the beginning
>> > of the page and dma is the start of the page. Hence, adding
>> > frame->page_offset back to the length here.
>> >
>> > However, if I read the code correctly, I think the map0_byte_count (before or
>> > after this patch) does not matter since it is only used in dma_unmap_page() and
>> > PAGE_SIZE is always used in dma_unmap_page() for this code path. Hence, I think
>> > we can just set map0_byte_count to PAGE_SIZE here.
>> >
>>
>> Right, in mlx4_alloc_pages we always map with PAGE_SIZE << order
>> dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
>> frag_info->dma_dir);
>> for XDP order is always 0, so you can safely set it to PAGE_SIZE.
>>
>> >> and here frame->page_offset is not really page offset, it can only be
>> >> XDP_PACKET_HEADROOM.
>> > Note that the XDP prog can call bpf_xdp_adjust_head() to add a header.
>> > The XDP prog can extend up to XDP_PACKET_HEADROOM (256) bytes but it
>> > can also (and usually) only add 40 bytes IPv6 header and then XDP_TX it out.
>> >
>>
>> I see.
>>
>> >>
>> >> > tx_info->nr_txbb = nr_txbb;
>> >> > tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
>> >> > tx_info->data_offset = (void *)data - (void *)tx_desc;
>> >> > @@ -1141,9 +1141,10 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
>> >> > tx_info->linear = 1;
>> >> > tx_info->inl = 0;
>> >> >
>> >> > - dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE);
>> >> > + dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset,
>> >> > + length, PCI_DMA_TODEVICE);
>> >> >
>> >> > - data->addr = cpu_to_be64(dma);
>> >> > + data->addr = cpu_to_be64(dma + frame->page_offset);
>> >> > data->lkey = ring->mr_key;
>> >> > dma_wmb();
>> >> > data->byte_count = cpu_to_be32(length);
>> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> >> > index 20a936428f4a..ba1c6cd0cc79 100644
>> >> > --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> >> > @@ -475,7 +475,8 @@ struct mlx4_en_frag_info {
>> >> > u16 frag_prefix_size;
>> >> > u32 frag_stride;
>> >> > enum dma_data_direction dma_dir;
>> >> > - int order;
>> >> > + u16 order;
>> >> > + u16 rx_headroom;
>> >> > };
>> >> >
>> >> > #ifdef CONFIG_MLX4_EN_DCB
>> >> > --
>> >> > 2.5.1
>> >> >
^ permalink raw reply
* [PATCH net] netvsc: reduce maximum GSO size
From: Stephen Hemminger @ 2016-12-06 21:43 UTC (permalink / raw)
To: davem; +Cc: netdev, Stephen Hemminger
Hyper-V (and Azure) support using NVGRE which requires some extra space
for encapsulation headers. Because of this the largest allowed TSO
packet is reduced.
For older releases, hard code a fixed reduced value. For the next release,
there is a better solution which uses the result of host offload
negotiation.
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
Please queue this for stable as well.
drivers/net/hyperv/netvsc_drv.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f638215..c9140c3 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -47,6 +47,10 @@
NETIF_F_TSO | \
NETIF_F_TSO6 | \
NETIF_F_HW_CSUM)
+
+/* Restrict GSO size to account for NVGRE */
+#define NETVSC_GSO_MAX_SIZE 62768
+
static int ring_size = 128;
module_param(ring_size, int, S_IRUGO);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
@@ -1400,6 +1404,7 @@ static int netvsc_probe(struct hv_device *dev,
nvdev = net_device_ctx->nvdev;
netif_set_real_num_tx_queues(net, nvdev->num_chn);
netif_set_real_num_rx_queues(net, nvdev->num_chn);
+ netif_set_gso_max_size(net, NETVSC_GSO_MAX_SIZE);
ret = register_netdev(net);
if (ret != 0) {
--
2.10.2
^ permalink raw reply related
* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
From: tndave @ 2016-12-06 22:04 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Jeff Kirsher, intel-wired-lan, Netdev
In-Reply-To: <CAKgT0UfTP+BrvDBzUJAVr9-DRCKgM7T3aS=LgRic8UZz8x82eg@mail.gmail.com>
On 12/06/2016 09:10 AM, Alexander Duyck wrote:
> On Mon, Dec 5, 2016 at 2:23 PM, tndave <tushar.n.dave@oracle.com> wrote:
>>
>>
>> On 12/05/2016 01:54 PM, Alexander Duyck wrote:
>>>
>>> On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com>
>>> wrote:
>>>>
>>>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
>>>> standard CSR where PCIe relaxed ordering can be set. Without PCIe relax
>>>> ordering enabled, i40e performance is significantly low on SPARC.
>>>>
>>>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>>>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>>>> This has shown 10x increase in performance numbers.
>>>>
>>>> e.g.
>>>> iperf TCP test with 10 threads on SPARC S7
>>>>
>>>> Test 1: Without this patch
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> [ 4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>>>> [ 5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>>>> [ 6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>>>> [ 7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>>>> [ 8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>>>> [ 9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>>>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>>>> [ ID] Interval Transfer Bandwidth
>>>> [ 4] 0.0-20.0 sec 566 MBytes 237 Mbits/sec
>>>> [ 5] 0.0-20.0 sec 532 MBytes 223 Mbits/sec
>>>> [ 6] 0.0-20.0 sec 537 MBytes 225 Mbits/sec
>>>> [ 8] 0.0-20.0 sec 546 MBytes 229 Mbits/sec
>>>> [ 11] 0.0-20.0 sec 592 MBytes 248 Mbits/sec
>>>> [ 7] 0.0-20.0 sec 539 MBytes 226 Mbits/sec
>>>> [ 9] 0.0-20.0 sec 572 MBytes 240 Mbits/sec
>>>> [ 10] 0.0-20.0 sec 604 MBytes 253 Mbits/sec
>>>> [ 14] 0.0-20.0 sec 567 MBytes 238 Mbits/sec
>>>> [ 12] 0.0-20.0 sec 511 MBytes 214 Mbits/sec
>>>> [SUM] 0.0-20.0 sec 5.44 GBytes 2.33 Gbits/sec
>>>>
>>>> Test 2: with this patch:
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
>>>> cookies. Check SNMP counters.
>>>> [ 4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>>>> [ 5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>>>> [ 6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>>>> [ 7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>>>> [ 8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>>>> [ 9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>>>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>>>> [ ID] Interval Transfer Bandwidth
>>>> [ 4] 0.0-20.0 sec 7.45 GBytes 3.19 Gbits/sec
>>>> [ 5] 0.0-20.0 sec 7.48 GBytes 3.21 Gbits/sec
>>>> [ 7] 0.0-20.0 sec 7.34 GBytes 3.15 Gbits/sec
>>>> [ 8] 0.0-20.0 sec 7.42 GBytes 3.18 Gbits/sec
>>>> [ 9] 0.0-20.0 sec 7.24 GBytes 3.11 Gbits/sec
>>>> [ 10] 0.0-20.0 sec 7.40 GBytes 3.17 Gbits/sec
>>>> [ 12] 0.0-20.0 sec 7.49 GBytes 3.21 Gbits/sec
>>>> [ 6] 0.0-20.0 sec 7.30 GBytes 3.13 Gbits/sec
>>>> [ 11] 0.0-20.0 sec 7.44 GBytes 3.19 Gbits/sec
>>>> [ 13] 0.0-20.0 sec 7.22 GBytes 3.10 Gbits/sec
>>>> [SUM] 0.0-20.0 sec 73.8 GBytes 31.6 Gbits/sec
>>>>
>>>> NOTE: In my testing, this patch does _not_ show any harm to i40e
>>>> performance numbers on x86.
>>>>
>>>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>>>
>>>
>>> You went through and replaced all of the dma_unmap/map_page calls with
>>> dma_map/unmap_single_attrs. I would prefer you didn't do that. I have
>>
>> Yes, because currently there is no DMA API for dma_map/unmap_page with dma
>> attr*
>>>
>>> patches to add the ability to map and unmap pages with attributes that
>>> should be available for 4.10-rc1 so if you could wait on this patch
>>> until then it would be preferred.
>>
>> :-) thanks. I will wait until your patches are out.
>>
>>>
>>>> ---
>>>> drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69
>>>> ++++++++++++++++++++---------
>>>> drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 +
>>>> 2 files changed, 49 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> index 6287bf6..800dca7 100644
>>>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct
>>>> i40e_ring *ring,
>>>> else
>>>> dev_kfree_skb_any(tx_buffer->skb);
>>>> if (dma_unmap_len(tx_buffer, len))
>>>> - dma_unmap_single(ring->dev,
>>>> - dma_unmap_addr(tx_buffer, dma),
>>>> - dma_unmap_len(tx_buffer, len),
>>>> - DMA_TO_DEVICE);
>>>> + dma_unmap_single_attrs(ring->dev,
>>>> + dma_unmap_addr(tx_buffer,
>>>> dma),
>>>> + dma_unmap_len(tx_buffer,
>>>> len),
>>>> + DMA_TO_DEVICE,
>>>> + ring->dma_attrs);
>>>> } else if (dma_unmap_len(tx_buffer, len)) {
>>>> - dma_unmap_page(ring->dev,
>>>> - dma_unmap_addr(tx_buffer, dma),
>>>> - dma_unmap_len(tx_buffer, len),
>>>> - DMA_TO_DEVICE);
>>>> + dma_unmap_single_attrs(ring->dev,
>>>> + dma_unmap_addr(tx_buffer, dma),
>>>> + dma_unmap_len(tx_buffer, len),
>>>> + DMA_TO_DEVICE,
>>>> + ring->dma_attrs);
>>>> }
>>>>
>>>> tx_buffer->next_to_watch = NULL;
>>>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>> struct i40e_tx_buffer *tx_buf;
>>>> struct i40e_tx_desc *tx_head;
>>>> struct i40e_tx_desc *tx_desc;
>>>> + dma_addr_t addr;
>>>> + size_t size;
>>>> unsigned int total_bytes = 0, total_packets = 0;
>>>> unsigned int budget = vsi->work_limit;
>>>>
>>>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>> napi_consume_skb(tx_buf->skb, napi_budget);
>>>>
>>>> /* unmap skb header data */
>>>> - dma_unmap_single(tx_ring->dev,
>>>> - dma_unmap_addr(tx_buf, dma),
>>>> - dma_unmap_len(tx_buf, len),
>>>> - DMA_TO_DEVICE);
>>>> + dma_unmap_single_attrs(tx_ring->dev,
>>>> + dma_unmap_addr(tx_buf, dma),
>>>> + dma_unmap_len(tx_buf, len),
>>>> + DMA_TO_DEVICE,
>>>> + tx_ring->dma_attrs);
>>>>
>>>> /* clear tx_buffer data */
>>>> tx_buf->skb = NULL;
>>>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>> tx_desc = I40E_TX_DESC(tx_ring, 0);
>>>> }
>>>>
>>>> + addr = dma_unmap_addr(tx_buf, dma);
>>>> + size = dma_unmap_len(tx_buf, len);
>>>
>>>
>>> On some architectures this change could lead to issues since
>>> dma_unmap_len could be 0 meaning that addr would never be used.
>>
>> I see. Thanks.
>>
>>>
>>>> /* unmap any remaining paged data */
>>>> if (dma_unmap_len(tx_buf, len)) {
>>>> - dma_unmap_page(tx_ring->dev,
>>>> - dma_unmap_addr(tx_buf,
>>>> dma),
>>>> - dma_unmap_len(tx_buf,
>>>> len),
>>>> - DMA_TO_DEVICE);
>>>> + dma_unmap_single_attrs(tx_ring->dev,
>>>> + addr,
>>>> + size,
>>>> + DMA_TO_DEVICE,
>>>> +
>>>> tx_ring->dma_attrs);
>>>> dma_unmap_len_set(tx_buf, len, 0);
>>>> }
>>>> }
>>>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring
>>>> *tx_ring)
>>>> */
>>>> tx_ring->size += sizeof(u32);
>>>> tx_ring->size = ALIGN(tx_ring->size, 4096);
>>>> +#ifdef CONFIG_SPARC
>>>> + tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>>>> +#else
>>>> + tx_ring->dma_attrs = 0;
>>>> +#endif
>>>> tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>>> &tx_ring->dma, GFP_KERNEL);
>>>> if (!tx_ring->desc) {
>>>
>>>
>>> Also not a fan of adding yet another ring attribute. Is there any reason why
>>> you couldn't simply add a set of inline functions at the start of
>>> i40e_txrx.c that could replace the DMA map/unmap operations in this
>>> code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed for the
>>> drivers? Then the x86 code doesn't have to change while the SPARC
>>> code will be able to be passed the attribute.
>>
>> Sure I can do that.
>>
>> I will follow up with patch after your patches for map/unmap page with dma
>> attr will be out.
>>
>> Thanks.
>>
>> -Tushar
>>
>
> I was thinking about it and I realized we can probably simplify this
> even further. In the case of most other architectures the
> DMA_ATTR_WEAK_ORDERING has no effect anyway. So from what I can
> tell there is probably no reason not to just always pass that
> attribute with the DMA mappings. From what I can tell the only
> other architecture that uses this is the PowerPC Cell architecture.
Yes, besides SPARC64, only PowerPC Cell architecture uses
DMA_ATTR_WEAK_ORDERING; I guess it should be okay to always pass
DMA_ATTR_WEAK_ORDERING.
>
> Also I was wondering if you actually needed to enable this attribute
> for both Rx and Tx buffers or just Rx buffers? The patch that
> enabled DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes,
> but I didn't see anything about reads. I'm just wondering if
> changing the code for Tx has any effect? If not you could probably
> drop those changes and just focus on Rx.
The patch I sent enabled DMA_ATTR_WEAK_ORDERING for sparc64 so that
writes to and reads from both rx and tx DMA buffers can use relaxed
ordering. Passing DMA_ATTR_WEAK_ORDERING for tx DMA buffers doesn't have
the same impact as passing it for rx DMA buffers.
However, I can only confirm whether DMA_ATTR_WEAK_ORDERING is not needed
at all for tx DMA buffers after collecting some more data!
Thanks.
-Tushar
>
> Thanks.
>
> - Alex
>
^ permalink raw reply
* Re: [PATCH v2 net-next 3/4] mlx4: xdp: Reserve headroom for receiving packet when XDP prog is active
From: Martin KaFai Lau @ 2016-12-06 22:25 UTC (permalink / raw)
To: Saeed Mahameed
Cc: Linux Netdev List, Alexei Starovoitov, Brenden Blanco,
Daniel Borkmann, David Miller, Jesper Dangaard Brouer,
Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <CALzJLG_+1fAv=i2uRF7cd_GZh2v66Pry=HusHmh6a_Nfq78CHg@mail.gmail.com>
On Tue, Dec 06, 2016 at 11:40:19PM +0200, Saeed Mahameed wrote:
> On Tue, Dec 6, 2016 at 8:27 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> > On Tue, Dec 06, 2016 at 06:50:47PM +0200, Saeed Mahameed wrote:
> >> On Mon, Dec 5, 2016 at 9:55 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> >> > On Mon, Dec 05, 2016 at 02:54:06AM +0200, Saeed Mahameed wrote:
> >> >> On Sun, Dec 4, 2016 at 5:17 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> >> >> > Reserve XDP_PACKET_HEADROOM and honor bpf_xdp_adjust_head()
> >> >> > when XDP prog is active. This patch only affects the code
> >> >> > path when XDP is active.
> >> >> >
> >> >> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> >> >> > ---
> >> >>
> >> >> Hi Martin, Sorry for the late review, i have some comments below
> >> >>
> >> >> > drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 17 +++++++++++++++--
> >> >> > drivers/net/ethernet/mellanox/mlx4/en_rx.c | 23 +++++++++++++++++------
> >> >> > drivers/net/ethernet/mellanox/mlx4/en_tx.c | 9 +++++----
> >> >> > drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++-
> >> >> > 4 files changed, 39 insertions(+), 13 deletions(-)
> >> >> >
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> >> >> > index 311c14153b8b..094a13b52cf6 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> >> >> > @@ -51,7 +51,8 @@
> >> >> > #include "mlx4_en.h"
> >> >> > #include "en_port.h"
> >> >> >
> >> >> > -#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
> >> >> > +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
> >> >> > + XDP_PACKET_HEADROOM))
> >> >> >
> >> >> > int mlx4_en_setup_tc(struct net_device *dev, u8 up)
> >> >> > {
> >> >> > @@ -1551,6 +1552,7 @@ int mlx4_en_start_port(struct net_device *dev)
> >> >> > struct mlx4_en_tx_ring *tx_ring;
> >> >> > int rx_index = 0;
> >> >> > int err = 0;
> >> >> > + int mtu;
> >> >> > int i, t;
> >> >> > int j;
> >> >> > u8 mc_list[16] = {0};
> >> >> > @@ -1684,8 +1686,12 @@ int mlx4_en_start_port(struct net_device *dev)
> >> >> > }
> >> >> >
> >> >> > /* Configure port */
> >> >> > + mtu = priv->rx_skb_size + ETH_FCS_LEN;
> >> >> > + if (priv->tx_ring_num[TX_XDP])
> >> >> > + mtu += XDP_PACKET_HEADROOM;
> >> >> > +
> >> >>
> >> >> Why would the physical MTU care for the headroom you preserve for XDP prog?
> >> >> This is the wire MTU, it shouldn't be changed, please keep it as
> >> >> before, any preservation you make in packets buffers are needed only
> >> >> for FWD case or modify case (HW or wire should not care about them).
> >> >
> >> > Thanks for your feedback!
> >>
> >> Just doing my job :))
> >>
> >> >
> >> > FWD:
> >> > packet received from a port
> >> > => process by a XDP prog
> >> > => XDP_TX out to the same port.
> >> >
> >> > For example, if the received packet has 1500 payload and the XDP prog
> >> > encapsulates it in an IPv6 header (+40 bytes). After testing, it cannot
> >> > be sent out due to the HW/wire MTU is 1500.
> >> >
> >> > Even the wire MTU info was passed to the XDP prog, there is not much a
> >> > XDP prog could do here other than dropping it.
> >> >
> >> > Hence, this patch gives guarantee to the XDP prog such that
> >> > it can always send out what it has received + XDP_PACKET_HEADROOM.
> >> >
> >>
> >> Still i am not convinced ! this is against common sense,
> >> this means that the XDP prog can send packets larger than the MTU
> >> seen on netdev!
> >>
> >> anyway if a packet with the size (MTU + XDP_PACKET_HEADROOM) was sent
> >> from XDP ring and HW allowed it to exit somehow (with the code you
> >> provided :)), most likely it will be dropped
> >> at the other end.
> > The MTU of our receiver side is larger than 1500.
> >
> > If the otherside could not handle >1500, we could lower the box running
> > XDP prog to 1460.
> >
>
> This is exactly the user confusion we are trying to avoid.
>
> Genuinely lowering the other side or dropping packets in XDP program
> that are not eligible for edit&FWD (packets > MTU - required headroom
> ) will create the same effect. why don't you use this approach ?
>
> dropping "large" packets in XDP seems the best solution.
Within the DC, yes we have absolute control on what to expect and we can even
lower the other end easily if it is needed. However, it may not be the case
for machines sitting at some exotic location.
After this thread, I think this bit may require more thoughts/discussions.
I will drop it now and revisit later since it is not user ABI related.
For now, lets check and drop at the driver side since the driver has the MTU
info.
>
> > Just ensure we are on the same page. The rx MTU stays the same (1500)
> > because the rx_desc's byte_count is not raised by XDP_PACKET_HEADROOM.
> >
>
> Yea it is clear,
>
> One more reason not to do this: now packets that were dropped due to
> "large MTU" HW drop cause, will now pass the HW check but will fail on
> RX error (RX buffers are smaller than the wire MTU sized packet) this
> counts as an error in both mlx5/4 which is not acceptable.
>
> >>
> >> I still think XDP prog should not be allowed to FW packets larger than
> >> the MTU seen on the netdev and you shouldn't modify the wire MTU just
> >> for this case.
> >>
> >> >>
> >> >> > err = mlx4_SET_PORT_general(mdev->dev, priv->port,
> >> >> > - priv->rx_skb_size + ETH_FCS_LEN,
> >> >> > + mtu,
> >> >> > priv->prof->tx_pause,
> >> >> > priv->prof->tx_ppp,
> >> >> > priv->prof->rx_pause,
> >> >> > @@ -2255,6 +2261,13 @@ static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
> >> >> > {
> >> >> > struct mlx4_en_priv *priv = netdev_priv(dev);
> >> >> >
> >> >> > + if (mtu + XDP_PACKET_HEADROOM > priv->max_mtu) {
> >> >> > + en_err(priv,
> >> >> > + "Device max mtu:%d does not allow %d bytes reserved headroom for XDP prog\n",
> >> >> > + priv->max_mtu, XDP_PACKET_HEADROOM);
> >> >> > + return false;
> >> >> > + }
> >> >> > +
> >> >> > if (mtu > MLX4_EN_MAX_XDP_MTU) {
> >> >> > en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
> >> >> > mtu, MLX4_EN_MAX_XDP_MTU);
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> >> >> > index 23e9d04d1ef4..324771ac929e 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> >> >> > @@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
> >> >> > struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
> >> >> > const struct mlx4_en_frag_info *frag_info;
> >> >> > struct page *page;
> >> >> > - dma_addr_t dma;
> >> >> > int i;
> >> >> >
> >> >> > for (i = 0; i < priv->num_frags; i++) {
> >> >> > @@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
> >> >> >
> >> >> > for (i = 0; i < priv->num_frags; i++) {
> >> >> > frags[i] = ring_alloc[i];
> >> >> > - dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
> >> >> > + frags[i].page_offset += priv->frag_info[i].rx_headroom;
> >> >>
> >> >> I don't see any need for headroom on frag_info other that frag0 (which
> >> >> where the packet starts).
> >> >> What is the meaning of a headroom of a frag in a middle of a packet ?
> >> >>
> >> >> if you agree with me then, you can use XDP_PACKET_HEADROOM as is where
> >> >> needed (i.e frag0 page offset) and remove
> >> >> "priv->frag_info[i].rx_headroom"
> >> >>
> >> >> ...
> >> >>
> >> >> After going through the code a little bit i see that this code is
> >> >> shared between XDP and common path, and you didn't want to add boolean
> >> >> conditions.
> >> >>
> >> >> Ok i see what you did here.
> >> >>
> >> >> Maybe we can pass headroom as a function parameter and split frag0
> >> >> handling from the rest ?
> >> >> If it is too much then i am ok with the code as it is,
> >> > Right, this patch does the boolean check (XDP active or not) early on
> >> > in mlx4_en_calc_rx_buf() (i.e. out of the fast path) and store
> >> > the result in priv->frag_info[0].rx_headroom.
> >> >
> >> > Just want to ensure I understand your comment correctly.
> >> > You prefer not to store the boolean test result in frag_info[0].rx_headroom
> >> > since it is redundant to !!priv->tx_ring_num[TX_XDP] and rx_headroom is also
> >> > confusing for frag[1-3].
> >> >
> >> > Instead, do the XDP [in]active test before calling mlx4_en_alloc_frags()
> >> > and then only adjust frags[0].page_offset by +XDP_PACKET_HEADROOM if is needed.
> >> > It could be done either by passing an extra argument to mlx4_en_alloc_frags()
> >> > or completely separate mlx4_en_alloc_frags(). I am fine with this also.
> >> >
> >>
> >> Correct, but if this change will add extra checks to the data path
> >> then I am ok with the current code.
> > Right, the check has to be done somewhere in the data path.
> > Lets stay with the current approach then.
> >
> >>
> >> >
> >> >>
> >> >> > + rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
> >> >> > + frags[i].page_offset);
> >> >> > ring_alloc[i] = page_alloc[i];
> >> >> > - rx_desc->data[i].addr = cpu_to_be64(dma);
> >> >> > }
> >> >> >
> >> >> > return 0;
> >> >> > @@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
> >> >> >
> >> >> > if (ring->page_cache.index > 0) {
> >> >> > frags[0] = ring->page_cache.buf[--ring->page_cache.index];
> >> >> > - rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
> >> >> > + rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
> >> >> > + frags[0].page_offset);
> >> >> > return 0;
> >> >> > }
> >> >> >
> >> >> > @@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
> >> >> > if (xdp_prog) {
> >> >> > struct xdp_buff xdp;
> >> >> > dma_addr_t dma;
> >> >> > + void *pg_addr, *orig_data;
> >> >> > u32 act;
> >> >> >
> >> >> > dma = be64_to_cpu(rx_desc->data[0].addr);
> >> >> > @@ -896,11 +898,18 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
> >> >> > priv->frag_info[0].frag_size,
> >> >> > DMA_FROM_DEVICE);
> >> >> >
> >> >> > - xdp.data = page_address(frags[0].page) +
> >> >> > - frags[0].page_offset;
> >> >> > + pg_addr = page_address(frags[0].page);
> >> >> > + orig_data = pg_addr + frags[0].page_offset;
> >> >> > + xdp.data = orig_data;
> >> >> > xdp.data_end = xdp.data + length;
> >> >> >
> >> >> > act = bpf_prog_run_xdp(xdp_prog, &xdp);
> >> >> > +
> >> >> > + if (xdp.data != orig_data) {
> >> >> > + length = xdp.data_end - xdp.data;
> >> >> > + frags[0].page_offset = xdp.data - pg_addr;
> >> >> > + }
> >> >> > +
> >> >> >
> >> >>
> >> >> is this needed only for XDP FWD case ?
> >> > No. It is also for PASS.
> >> >
> >>
> >> I see.
> >>
> >> >> is this the only way to detect that the user modified the packet
> >> >> headers (comparing pointers, before and after) ?
> >> > Yes
> >> >
> >> >>
> >> >> if the answer is yes, it should be faster to unconditionally reset
> >> >> packet offset and lenght on XDP_FWD :
> >> >> case XDP_FWD:
> >> >> length = xdp.data_end - xdp.data;
> >> >> frags[0].page_offset = xdp.data - pg_addr;
> >> >>
> >> >>
> >> >> > switch (act) {
> >> >> > case XDP_PASS:
> >> >> > break;
> >> >> > @@ -1180,6 +1189,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
> >> >> > */
> >> >> > priv->frag_info[0].frag_stride = PAGE_SIZE;
> >> >> > priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
> >> >> > + priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
> >> >> > i = 1;
> >> >> > } else {
> >> >> > int buf_size = 0;
> >> >> > @@ -1194,6 +1204,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
> >> >> > ALIGN(priv->frag_info[i].frag_size,
> >> >> > SMP_CACHE_BYTES);
> >> >> > priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE;
> >> >> > + priv->frag_info[i].rx_headroom = 0;
> >> >>
> >> >> IMHO, redundant. as you see here frag0 and other frags handling are
> >> >> separated, maybe we can do the same in mlx4_en_alloc_frags.
> >> >>
> >> >> > buf_size += priv->frag_info[i].frag_size;
> >> >> > i++;
> >> >> > }
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> >> >> > index 4b597dca5c52..9e5f38cefe5f 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> >> >> > @@ -354,7 +354,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
> >> >> > struct mlx4_en_rx_alloc frame = {
> >> >> > .page = tx_info->page,
> >> >> > .dma = tx_info->map0_dma,
> >> >> > - .page_offset = 0,
> >> >> > + .page_offset = XDP_PACKET_HEADROOM,
> >> >> > .page_size = PAGE_SIZE,
> >> >> > };
> >> >> >
> >> >> > @@ -1132,7 +1132,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
> >> >> > tx_info->page = frame->page;
> >> >> > frame->page = NULL;
> >> >> > tx_info->map0_dma = dma;
> >> >> > - tx_info->map0_byte_count = length;
> >> >> > + tx_info->map0_byte_count = length + frame->page_offset;
> >> >>
> >> >> Didn't you already take care of lenght by the following code:
> >> >> if (xdp.data != orig_data) {
> >> >> length = xdp.data_end - xdp.data;
> >> >> frags[0].page_offset = xdp.data - pg_addr;
> >> >> }
> >> >>
> >> > Before this patch, length always assumes the data starts at the beginning
> >> > of the page and dma is the start of the page. Hence, adding
> >> > framg->page_offset back to the length here.
> >> >
> >> > However, if I read the codes correctly, I think the map0_byte_count (before or
> >> > after this patch) does not matter since it is only used in dma_unmap_page() and
> >> > PAGE_SIZE is always used in dma_unmap_page() for this code patch. Hence, I think
> >> > we can just set map0_byte_count to PAGE_SIZE here.
> >> >
> >>
> >> Right, in mlx4_alloc_pages we always map with PAGE_SIZE << order
> >> dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
> >> frag_info->dma_dir);
> >> for XDP order is always 0, so you can safely set it to PAGE_SIZE.
> >>
> >> >> and here frame->page_offset is not really page offset, it can only be
> >> >> XDP_PACKET_HEADROOM.
> >> > Note that the XDP prog can call bpf_xdp_adjust_head() to add a header.
> >> > The XDP prog can extend up to XDP_PACKET_HEADROOM (256) bytes but it
> >> > can also (and usually) only add 40 bytes IPv6 header and then XDP_TX it out.
> >> >
> >>
> >> I see.
> >>
> >> >>
> >> >> > tx_info->nr_txbb = nr_txbb;
> >> >> > tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
> >> >> > tx_info->data_offset = (void *)data - (void *)tx_desc;
> >> >> > @@ -1141,9 +1141,10 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
> >> >> > tx_info->linear = 1;
> >> >> > tx_info->inl = 0;
> >> >> >
> >> >> > - dma_sync_single_for_device(priv->ddev, dma, length, PCI_DMA_TODEVICE);
> >> >> > + dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset,
> >> >> > + length, PCI_DMA_TODEVICE);
> >> >> >
> >> >> > - data->addr = cpu_to_be64(dma);
> >> >> > + data->addr = cpu_to_be64(dma + frame->page_offset);
> >> >> > data->lkey = ring->mr_key;
> >> >> > dma_wmb();
> >> >> > data->byte_count = cpu_to_be32(length);
> >> >> > diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> >> >> > index 20a936428f4a..ba1c6cd0cc79 100644
> >> >> > --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> >> >> > +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> >> >> > @@ -475,7 +475,8 @@ struct mlx4_en_frag_info {
> >> >> > u16 frag_prefix_size;
> >> >> > u32 frag_stride;
> >> >> > enum dma_data_direction dma_dir;
> >> >> > - int order;
> >> >> > + u16 order;
> >> >> > + u16 rx_headroom;
> >> >> > };
> >> >> >
> >> >> > #ifdef CONFIG_MLX4_EN_DCB
> >> >> > --
> >> >> > 2.5.1
> >> >> >
^ permalink raw reply
* [PATCH net-next 0/2] Add ethtool set regs support
From: Saeed Mahameed @ 2016-12-06 22:33 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, John W . Linville, Saeed Mahameed
Hi Dave,
This series adds the support for setting device registers from user
space ethtool.
Currently ethtool only allows to get device registers,
we extend ethtool functionality to also set device registers, by
introducing set_regs to ethtool_ops which will be invoked when
user space requests "ETHTOOL_SREGS", for example via ethtool user app:
ethtool -D DEVNAME [ file FILENAME ] is used to set registers in
the device using vendor specific binary registers data provided via
stdin/file. Changes made by this option can be queried using get
regs -d flag.
This simple ethtool change will give HW vendors the flexibility to set
pure HW configurations (not directly related to netdev resources states
and rings), without the need of vendor proprietary tools and hacks.
2nd patch adds the support for ethtool set/get_regs in mlx5e driver.
Important Note: With this extension we will allow HW vendors to access (set) their
device register without the need for them to open their format, hence the binary
file passed on ethtool -D DEVNAME.
This means that the device driver MUST check for correctness/validity of the
registers data sent to it and whether this register is permitted to be set from user space
in order to prevent the user from accessing/setting registers/Device configurations
that are already standardized by the kernel/stack user APIs, or are not allowed to be seen/set by the user.
The mlx5 driver has an allowed-access list of registers and will check the validity of the user
request before forwarding it to the HW registers. Mlx5 will allow only mlx5 specific
configurations to be set (e.g. Device Diag Counters for HW performance debugging and analysis)
which has no standard API to access it.
Comments and redirections are more than welcome
This series was generated against commit:
b0da4f743db5 ("net: calxeda: xgmac: use new api ethtool_{get|set}_link_ksettings")
Thanks,
Saeed.
Gal Pressman (2):
ethtool: Add set regs -D option support
net/mlx5e: Add ethtool get/set reg support
drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 19 ----
drivers/net/ethernet/mellanox/mlx5/core/en.h | 12 +++
.../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 21 ++++
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++
drivers/net/ethernet/mellanox/mlx5/core/en_regs.c | 116 +++++++++++++++++++++
include/linux/ethtool.h | 1 +
include/linux/mlx5/mlx5_ifc.h | 22 ++++
include/uapi/linux/ethtool.h | 1 +
net/core/ethtool.c | 31 ++++++
10 files changed, 213 insertions(+), 20 deletions(-)
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_regs.c
--
2.7.4
^ permalink raw reply
* [PATCH net-next 2/2] net/mlx5e: Add ethtool get/set reg support
From: Saeed Mahameed @ 2016-12-06 22:33 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, John W . Linville, Gal Pressman, Dmitry Teif,
Saeed Mahameed
In-Reply-To: <1481063590-7727-1-git-send-email-saeedm@mellanox.com>
From: Gal Pressman <galp@mellanox.com>
Add ethtool -[dD] callbacks support for get and set registers.
This interface allows users to query and change device registers.
Add the support for set/get DIAGNOSTIC_PARAMS/COUNTERS registers.
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Dmitry Teif <dimat@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 19 ----
drivers/net/ethernet/mellanox/mlx5/core/en.h | 12 +++
.../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 21 ++++
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++
drivers/net/ethernet/mellanox/mlx5/core/en_regs.c | 116 +++++++++++++++++++++
include/linux/mlx5/mlx5_ifc.h | 22 ++++
7 files changed, 180 insertions(+), 20 deletions(-)
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_regs.c
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9f43beb..b24564c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -8,6 +8,6 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o eswitch_offloads.o \
en_main.o en_common.o en_fs.o en_ethtool.o en_tx.o \
en_rx.o en_rx_am.o en_txrx.o en_clock.o vxlan.o \
- en_tc.o en_arfs.o en_rep.o en_fs_ethtool.o en_selftest.o
+ en_tc.o en_arfs.o en_rep.o en_fs_ethtool.o en_selftest.o en_regs.o
mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index b0448b5..f8b6c83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -650,25 +650,6 @@ static int cmd_status_to_err(u8 status)
}
}
-struct mlx5_ifc_mbox_out_bits {
- u8 status[0x8];
- u8 reserved_at_8[0x18];
-
- u8 syndrome[0x20];
-
- u8 reserved_at_40[0x40];
-};
-
-struct mlx5_ifc_mbox_in_bits {
- u8 opcode[0x10];
- u8 reserved_at_10[0x10];
-
- u8 reserved_at_20[0x10];
- u8 op_mod[0x10];
-
- u8 reserved_at_40[0x40];
-};
-
void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome)
{
*status = MLX5_GET(mbox_out, out, status);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 63dd639..fcc296b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -659,6 +659,11 @@ struct mlx5e_tir {
struct list_head list;
};
+struct mlx5e_reg {
+ u8 data_in[MLX5_ST_SZ_BYTES(mbox_in)];
+ u8 *data_out;
+};
+
enum {
MLX5E_TC_PRIO = 0,
MLX5E_NIC_PRIO
@@ -713,6 +718,7 @@ struct mlx5e_priv {
struct mlx5e_stats stats;
struct mlx5e_tstamp tstamp;
u16 q_counter;
+ struct mlx5e_reg *reg;
#ifdef CONFIG_MLX5_CORE_EN_DCB
struct mlx5e_dcbx dcbx;
#endif
@@ -803,6 +809,12 @@ int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
u8 cq_period_mode);
+struct mlx5e_reg *mlx5e_regs_init(void);
+int mlx5e_regs_set(struct net_device *dev, void *buff, int inlen);
+void mlx5e_regs_get(struct net_device *dev, void *buff);
+int mlx5e_regs_get_len(void);
+void mlx5e_regs_destroy(struct mlx5e_reg *reg);
+
static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
struct mlx5_wqe_ctrl_seg *ctrl, int bf_sz)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 352462a..6adc9ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1552,6 +1552,23 @@ static u32 mlx5e_get_priv_flags(struct net_device *netdev)
return priv->params.pflags;
}
+static int mlx5e_get_regs_len(struct net_device *dev)
+{
+ return mlx5e_regs_get_len();
+}
+
+static void mlx5e_get_regs(struct net_device *dev, struct ethtool_regs *regs,
+ void *buff)
+{
+ mlx5e_regs_get(dev, buff);
+}
+
+static int mlx5e_set_regs(struct net_device *dev, struct ethtool_regs *regs,
+ u8 *data)
+{
+ return mlx5e_regs_set(dev, data, regs->len);
+}
+
static int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
{
int err = 0;
@@ -1605,4 +1622,8 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
.get_priv_flags = mlx5e_get_priv_flags,
.set_priv_flags = mlx5e_set_priv_flags,
.self_test = mlx5e_self_test,
+ .get_regs_len = mlx5e_get_regs_len,
+ .get_regs = mlx5e_get_regs,
+ .set_regs = mlx5e_set_regs,
+
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9def5cc..e1905ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3686,6 +3686,11 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
mlx5e_build_nic_netdev_priv(mdev, netdev, profile, ppriv);
mlx5e_build_nic_netdev(netdev);
+
+ priv->reg = mlx5e_regs_init();
+ if (!priv->reg)
+ mlx5_core_warn(mdev, "Failed to allocate mlx5e_reg\n");
+
mlx5e_vxlan_init(priv);
}
@@ -3696,6 +3701,9 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
mlx5e_vxlan_cleanup(priv);
+ if (priv->reg)
+ mlx5e_regs_destroy(priv->reg);
+
if (MLX5_CAP_GEN(mdev, vport_group_manager))
mlx5_eswitch_unregister_vport_rep(esw, 0);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_regs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_regs.c
new file mode 100644
index 0000000..a83df1f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_regs.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "en.h"
+
+#define MLX5E_MAX_REG_LEN 4096
+#define MLX5E_MAX_CMD_OUT_LEN (MLX5E_MAX_REG_LEN - MLX5_ST_SZ_BYTES(mbox_in))
+
+static void reg_out_alloc(struct mlx5e_reg *reg)
+{
+ if (reg->data_out) {
+ memset(reg->data_out, 0, MLX5E_MAX_CMD_OUT_LEN);
+ return;
+ }
+
+ reg->data_out = mlx5_vzalloc(MLX5E_MAX_CMD_OUT_LEN);
+}
+
+struct mlx5e_reg *mlx5e_regs_init(void)
+{
+ return kzalloc(sizeof(struct mlx5e_reg), GFP_KERNEL);
+}
+
+void mlx5e_regs_destroy(struct mlx5e_reg *reg)
+{
+ kvfree(reg->data_out);
+ kfree(reg);
+}
+
+static bool opcode_valid(u16 opcode)
+{
+ switch (opcode) {
+ case MLX5_CMD_OP_QUERY_HCA_CAP:
+ case MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS:
+ case MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS:
+ case MLX5_CMD_OP_QUERY_DIAGNOSTICS_COUNTERS:
+ return true;
+ }
+
+ return false;
+}
+
+int mlx5e_regs_set(struct net_device *dev, void *buff, int inlen)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5_core_dev *mdev = priv->mdev;
+ struct mlx5e_reg *reg = priv->reg;
+ u16 opcode;
+
+ if (!reg)
+ return -ENOMEM;
+
+ opcode = MLX5_GET(mbox_in, buff, opcode);
+ if (!opcode_valid(opcode))
+ return -EINVAL;
+
+ reg_out_alloc(reg);
+ if (!reg->data_out)
+ return -ENOMEM;
+
+ memcpy(reg->data_in, buff, sizeof(reg->data_in));
+
+ return mlx5_cmd_exec(mdev, buff, inlen, reg->data_out,
+ MLX5E_MAX_CMD_OUT_LEN);
+}
+
+void mlx5e_regs_get(struct net_device *dev, void *buff)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_reg *reg = priv->reg;
+
+ if (!reg)
+ return;
+
+ if (reg->data_out) {
+ memcpy(buff, reg->data_in, sizeof(reg->data_in));
+ memcpy(buff + sizeof(reg->data_in), reg->data_out,
+ MLX5E_MAX_CMD_OUT_LEN);
+ }
+}
+
+int mlx5e_regs_get_len(void)
+{
+ return MLX5E_MAX_REG_LEN;
+}
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a5f0fbe..9738b70 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -169,6 +169,9 @@ enum {
MLX5_CMD_OP_DEALLOC_XRCD = 0x80f,
MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN = 0x816,
MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN = 0x817,
+ MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS = 0x819,
+ MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS = 0x820,
+ MLX5_CMD_OP_QUERY_DIAGNOSTICS_COUNTERS = 0x821,
MLX5_CMD_OP_QUERY_CONG_STATUS = 0x822,
MLX5_CMD_OP_MODIFY_CONG_STATUS = 0x823,
MLX5_CMD_OP_QUERY_CONG_PARAMS = 0x824,
@@ -230,6 +233,25 @@ enum {
MLX5_CMD_OP_MAX
};
+struct mlx5_ifc_mbox_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_mbox_in_bits {
+ u8 opcode[0x10];
+ u8 reserved_at_10[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x40];
+};
+
struct mlx5_ifc_flow_table_fields_supported_bits {
u8 outer_dmac[0x1];
u8 outer_smac[0x1];
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 1/2] ethtool: Add set regs -D option support
From: Saeed Mahameed @ 2016-12-06 22:33 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, John W . Linville, Gal Pressman, Dmitry Teif,
Saeed Mahameed
In-Reply-To: <1481063590-7727-1-git-send-email-saeedm@mellanox.com>
From: Gal Pressman <galp@mellanox.com>
Currently ethtool only allows us to get device registers, in this patch
we extend this functionality to also set device registers.
ethtool -D DEVNAME [ file FILENAME ] is used to set registers in
the device using vendor specific binary registers data provided via
stdin/file. Changes made by this option can be queried using get
regs -d flag.
Example:
$ ethtool -D eth1 file /tmp/mlx5_regs
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Dmitry Teif <dimat@mellanox.com>
CC: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
include/linux/ethtool.h | 1 +
include/uapi/linux/ethtool.h | 1 +
net/core/ethtool.c | 31 +++++++++++++++++++++++++++++++
3 files changed, 33 insertions(+)
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 9ded8c6..c9f5d37 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -305,6 +305,7 @@ struct ethtool_ops {
void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
int (*get_regs_len)(struct net_device *);
void (*get_regs)(struct net_device *, struct ethtool_regs *, void *);
+ int (*set_regs)(struct net_device *, struct ethtool_regs *, u8 *);
void (*get_wol)(struct net_device *, struct ethtool_wolinfo *);
int (*set_wol)(struct net_device *, struct ethtool_wolinfo *);
u32 (*get_msglevel)(struct net_device *);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f0db778..f81c6fd 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1330,6 +1330,7 @@ struct ethtool_per_queue_op {
#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */
#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */
#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */
+#define ETHTOOL_SREGS 0x00000050 /* Set NIC registers */
/* compatibility with older code */
#define SPARC_ETH_GSET ETHTOOL_GSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e23766c..5548565 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1424,6 +1424,34 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
return ret;
}
+static int ethtool_set_regs(struct net_device *dev, char __user *useraddr)
+{
+ void __user *userbuf = useraddr + offsetof(struct ethtool_regs, data);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_regs regs;
+ int ret = 0;
+ u8 *data;
+
+ if (!ops->set_regs || !ops->get_regs_len)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&regs, useraddr, sizeof(regs)))
+ return -EFAULT;
+
+ data = kmalloc(PAGE_SIZE, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ ret = -EFAULT;
+ if (copy_from_user(data, userbuf, regs.len))
+ goto out;
+
+ ret = ops->set_regs(dev, &regs, data);
+
+out:
+ kfree(data);
+ return ret;
+}
+
static int ethtool_reset(struct net_device *dev, char __user *useraddr)
{
struct ethtool_value reset;
@@ -2597,6 +2625,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GREGS:
rc = ethtool_get_regs(dev, useraddr);
break;
+ case ETHTOOL_SREGS:
+ rc = ethtool_set_regs(dev, useraddr);
+ break;
case ETHTOOL_GWOL:
rc = ethtool_get_wol(dev, useraddr);
break;
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next 7/7] bnxt_en: Add interface to support RDMA driver.
From: kbuild test robot @ 2016-12-06 22:33 UTC (permalink / raw)
To: Michael Chan
Cc: kbuild-all, davem, netdev, selvin.xavier, somnath.kotur, dledford,
linux-rdma
In-Reply-To: <1481044178-25193-8-git-send-email-michael.chan@broadcom.com>
[-- Attachment #1: Type: text/plain, Size: 5469 bytes --]
Hi Michael,
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/Michael-Chan/bnxt_en-Add-interface-to-support-RDMA-driver/20161207-053721
config: i386-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c: In function 'bnxt_unregister_dev':
>> drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c:97:4: warning: 'i' may be used uninitialized in this function [-Wmaybe-uninitialized]
i++;
~^~
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c: In function 'bnxt_ulp_stop':
>> drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c:219:6: warning: 'ops' may be used uninitialized in this function [-Wmaybe-uninitialized]
if (!ops || !ops->ulp_stop)
^
vim +/i +97 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
91 RCU_INIT_POINTER(ulp->ulp_ops, NULL);
92 synchronize_rcu();
93 ulp->max_async_event_id = 0;
94 ulp->async_events_bmap = NULL;
95 while (atomic_read(&ulp->ref_count) != 0 && i < 10) {
96 msleep(100);
> 97 i++;
98 }
99 return 0;
100 }
101
102 static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id,
103 struct bnxt_msix_entry *ent, int num_msix)
104 {
105 struct net_device *dev = edev->net;
106 struct bnxt *bp = netdev_priv(dev);
107 int max_idx, max_cp_rings;
108 int avail_msix, i, idx;
109
110 ASSERT_RTNL();
111 if (ulp_id != BNXT_ROCE_ULP)
112 return -EINVAL;
113
114 if (!(bp->flags & BNXT_FLAG_USING_MSIX))
115 return -ENODEV;
116
117 max_cp_rings = bnxt_get_max_func_cp_rings(bp);
118 max_idx = min_t(int, bp->total_irqs, max_cp_rings);
119 avail_msix = max_idx - bp->cp_nr_rings;
120 if (!avail_msix)
121 return -ENOMEM;
122 if (avail_msix > num_msix)
123 avail_msix = num_msix;
124
125 idx = max_idx - avail_msix;
126 for (i = 0; i < avail_msix; i++) {
127 ent[i].vector = bp->irq_tbl[idx + i].vector;
128 ent[i].ring_idx = idx + i;
129 ent[i].db_offset = (idx + i) * 0x80;
130 }
131 bnxt_set_max_func_irqs(bp, max_idx - avail_msix);
132 bnxt_set_max_func_cp_rings(bp, max_cp_rings - avail_msix);
133 edev->ulp_tbl[ulp_id].msix_requested = avail_msix;
134 return avail_msix;
135 }
136
137 static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id)
138 {
139 struct net_device *dev = edev->net;
140 struct bnxt *bp = netdev_priv(dev);
141 int max_cp_rings, msix_requested;
142
143 ASSERT_RTNL();
144 if (ulp_id != BNXT_ROCE_ULP)
145 return -EINVAL;
146
147 max_cp_rings = bnxt_get_max_func_cp_rings(bp);
148 msix_requested = edev->ulp_tbl[ulp_id].msix_requested;
149 bnxt_set_max_func_cp_rings(bp, max_cp_rings + msix_requested);
150 edev->ulp_tbl[ulp_id].msix_requested = 0;
151 bnxt_set_max_func_irqs(bp, bp->total_irqs);
152 return 0;
153 }
154
155 void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id)
156 {
157 ASSERT_RTNL();
158 if (bnxt_ulp_registered(bp->edev, ulp_id)) {
159 struct bnxt_en_dev *edev = bp->edev;
160 unsigned int msix_req, max;
161
162 msix_req = edev->ulp_tbl[ulp_id].msix_requested;
163 max = bnxt_get_max_func_cp_rings(bp);
164 bnxt_set_max_func_cp_rings(bp, max - msix_req);
165 max = bnxt_get_max_func_stat_ctxs(bp);
166 bnxt_set_max_func_stat_ctxs(bp, max - 1);
167 }
168 }
169
170 static int bnxt_send_msg(struct bnxt_en_dev *edev, int ulp_id,
171 struct bnxt_fw_msg *fw_msg)
172 {
173 struct net_device *dev = edev->net;
174 struct bnxt *bp = netdev_priv(dev);
175 struct input *req;
176 int rc;
177
178 mutex_lock(&bp->hwrm_cmd_lock);
179 req = fw_msg->msg;
180 req->resp_addr = cpu_to_le64(bp->hwrm_cmd_resp_dma_addr);
181 rc = _hwrm_send_message(bp, fw_msg->msg, fw_msg->msg_len,
182 fw_msg->timeout);
183 if (!rc) {
184 struct output *resp = bp->hwrm_cmd_resp_addr;
185 u32 len = le16_to_cpu(resp->resp_len);
186
187 if (fw_msg->resp_max_len < len)
188 len = fw_msg->resp_max_len;
189
190 memcpy(fw_msg->resp, resp, len);
191 }
192 mutex_unlock(&bp->hwrm_cmd_lock);
193 return rc;
194 }
195
196 static void bnxt_ulp_get(struct bnxt_ulp *ulp)
197 {
198 atomic_inc(&ulp->ref_count);
199 }
200
201 static void bnxt_ulp_put(struct bnxt_ulp *ulp)
202 {
203 atomic_dec(&ulp->ref_count);
204 }
205
206 void bnxt_ulp_stop(struct bnxt *bp)
207 {
208 struct bnxt_en_dev *edev = bp->edev;
209 struct bnxt_ulp_ops *ops;
210 int i;
211
212 if (!edev)
213 return;
214
215 for (i = 0; i < BNXT_MAX_ULP; i++) {
216 struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
217
218 rtnl_dereference(ulp->ulp_ops);
> 219 if (!ops || !ops->ulp_stop)
220 continue;
221 ops->ulp_stop(ulp->handle);
222 }
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 57018 bytes --]
^ permalink raw reply
* Re: [PATCH nf-next] netfilter: xt_bpf: support ebpf
From: Willem de Bruijn @ 2016-12-06 22:44 UTC (permalink / raw)
To: Florian Westphal
Cc: Willem de Bruijn, Pablo Neira Ayuso, netfilter-devel,
Network Development, Daniel Borkmann, Eric Dumazet
In-Reply-To: <20161206002027.GD16819@breakpoint.cc>
On Mon, Dec 5, 2016 at 7:20 PM, Florian Westphal <fw@strlen.de> wrote:
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
>> While we're discussing the patch, another question, about revisions: I
>> tested both modified and original iptables binaries on both standard
>> and modified kernels. It all works as expected, except for the case
>> where both binaries are used on a single kernel. For instance:
>>
>> iptables -A OUTPUT -m bpf --bytecode "`./nfbpf_compile RAW 'udp port
>> 8000'`" -j LOG
>> ./iptables.new -L
>>
>> Here the new binary will interpret the object as xt_bpf_match_v1, but
>> iptables has inserted xt_bpf_match. The same problem happens the other
>> way around. A new binary can be made robust to detect old structs, but
>> not the other way around. Specific to bpf, the existing xt_bpf code
>> has an unfortunate bug that it always prints at least one line of
>> code, even if ->bpf_program_num_elems == 0.
>>
>> I notice that other extensions also do not necessarily only extend
>> struct vN in vN+1. Is the above a known issue?
>
> Yes, I guess noone ever bothered to fix this.
>
> The kernel blob should contain the match/target revision number,
> so userspace can in fact see that 'this is bpf v42', but iirc
> the netfilter userspace just loads the highest userspace revision
> supported by the kernel (which is then different for the 2 iptables
> binaries).
We can fall back on not parsing contents on mismatch:
diff --git a/iptables/iptables.c b/iptables/iptables.c
index 540d111..ada7c94 100644
--- a/iptables/iptables.c
+++ b/iptables/iptables.c
@@ -504,7 +504,8 @@ print_match(const struct xt_entry_match *m,
xtables_find_match(m->u.user.name, XTF_TRY_LOAD, NULL);
if (match) {
- if (match->print)
+ if (match->print &&
+ m->u.user.revision == match->revision)
match->print(ip, m, numeric);
else
printf("%s ", match->name);
> But we *could* display message like 'kernel uses revision 2 but I can
> only find 0 and 1' or fall back to the lower supported revision without
> guess-the-struct-by-size games.
That's a good idea. A special case printf() with a notice, then.
^ permalink raw reply related
* Re: [PATCH net-next 0/2] Add ethtool set regs support
From: Stephen Hemminger @ 2016-12-06 22:45 UTC (permalink / raw)
To: Saeed Mahameed; +Cc: David S. Miller, netdev, John W . Linville
In-Reply-To: <1481063590-7727-1-git-send-email-saeedm@mellanox.com>
On Wed, 7 Dec 2016 00:33:08 +0200
Saeed Mahameed <saeedm@mellanox.com> wrote:
> This simple ethool change will give HW vendors the flexibility to set
> pure HW configurations (not directly related to netdev resources states
> and rings), without the need of vendor proprietary tools and hacks.
The danger is you need to restrict the kernel to only allow setting
safe registers (and this is HW dependent). There are cases like secure
boot where it is expected that even root is not allowed to modify
all memory.
Also supporting closed format of device registers is not in the interest
of promoting open source.
I am not saying I fundamentally disagree with supporting this, but it
is a bigger step than you make it out to be.
^ permalink raw reply
* Re: [PATCH net-next V2 1/2] net/sched: cls_flower: Add support for matching on flags
From: kbuild test robot @ 2016-12-06 23:03 UTC (permalink / raw)
To: Or Gerlitz
Cc: kbuild-all, David S. Miller, netdev, Jiri Pirko, Roi Dayan,
Hadar Har-Zion, Or Gerlitz
In-Reply-To: <1481037486-27195-2-git-send-email-ogerlitz@mellanox.com>
[-- Attachment #1: Type: text/plain, Size: 1819 bytes --]
Hi Or,
[auto build test ERROR on net-next/master]
url: https://github.com/0day-ci/linux/commits/Or-Gerlitz/net-sched-cls_flower-Add-support-for-matching-on-flags/20161207-012247
config: arm-allmodconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm
Note: the linux-review/Or-Gerlitz/net-sched-cls_flower-Add-support-for-matching-on-flags/20161207-012247 HEAD 591ecce02e6ed3dab17d5c45a3f7368581c596ce builds fine.
It only hurts bisectibility.
All errors (new ones prefixed by >>):
In file included from include/net/pkt_cls.h:4:0,
from drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:34:
>> include/uapi/linux/pkt_cls.h:470:37: error: implicit declaration of function 'BIT' [-Werror=implicit-function-declaration]
TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = BIT(0),
^~~
>> include/uapi/linux/pkt_cls.h:470:2: error: enumerator value for 'TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT' is not an integer constant
TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = BIT(0),
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/BIT +470 include/uapi/linux/pkt_cls.h
464 __TCA_FLOWER_MAX,
465 };
466
467 #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
468
469 enum {
> 470 TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = BIT(0),
471 };
472
473 /* Match-all classifier */
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 59574 bytes --]
^ permalink raw reply
* Soft lockup in inet_put_port on 4.6
From: Tom Herbert @ 2016-12-06 23:06 UTC (permalink / raw)
To: Linux Kernel Network Developers, Josef Bacik
Hello,
We are seeing a fair number of machines getting into softlockup in 4.6
kernel. As near as I can tell this is happening on the spinlock in
bind hash bucket. When inet_csk_get_port exits and does spinunlock_bh
the TCP timer runs and we hit lockup in inet_put_port (presumably on
same lock). It seems like the lock isn't properly being unlocked
somewhere but I don't readily see it.
Any ideas?
Thanks,
Tom
NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [proxygend:4152094]
Modules linked in: fuse nf_log_ipv6 ip6t_REJECT nf_reject_ipv6
nf_log_ipv4 nf_log_common xt_LOG ipt_REJECT nf_reject_ipv4 xt_limit
xt_multiport ipip ip_tunnel tunnel4 ip6_tunnel tunnel6 coretemp mptctl
mptbase cls_bpf ipmi_watchdog tcp_diag inet_diag ip6table_filter
xt_NFLOG nfnetlink_log xt_comment xt_statistic iptable_filter xt_mark
tpm_crb i2c_piix4 dm_crypt loop ipmi_devintf acpi_cpufreq iTCO_wdt
iTCO_vendor_support ipmi_si ipmi_msghandler efivars i2c_i801 sg
lpc_ich mfd_core hpilo xhci_pci xhci_hcd button nvme nvme_core
CPU: 22 PID: 4152094 Comm: proxygend Tainted: G W L
4.6.7-13_fbk3_1119_g367d67b #13
Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 12/27/2015
task: ffff88168c52d100 ti: ffff881c12fb0000 task.ti: ffff881c12fb0000
RIP: 0010:[<ffffffff810b87b8>] [<ffffffff810b87b8>]
queued_spin_lock_slowpath+0xf8/0x170
RSP: 0018:ffff883fff303da0 EFLAGS: 00000246
RAX: 0000000000000000 RBX: ffff881257163e00 RCX: 0000000000000001
RDX: ffff883fff375e40 RSI: 00000000005c0000 RDI: ffffc90018d6bae0
RBP: ffff883fff303da0 R08: ffff883fff315e40 R09: 0000000000000000
R10: 0000000000000020 R11: 00000000000001c0 R12: ffffc90018d6bae0
R13: ffffffff820f8a80 R14: ffff881257163f30 R15: 0000000000000000
FS: 00007fa7bb7ff700(0000) GS:ffff883fff300000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ff4be114d90 CR3: 000000243f99c000 CR4: 00000000003406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Stack: ffff883fff303db0 ffffffff817e5910 ffff883fff303dd8 ffffffff8172f6b4
ffff881257163e00 0000000000000007 0000000000000004 ffff883fff303e00
ffffffff81733237 ffff881257163e00 0000000000000000 ffffffff81ce7cc0
Call Trace:
<IRQ>
[<ffffffff817e5910>] _raw_spin_lock+0x20/0x30
[<ffffffff8172f6b4>] inet_put_port+0x54/0xb0
[<ffffffff81733237>] tcp_set_state+0x67/0xc0
[<ffffffff81733a33>] tcp_done+0x33/0x90
[<ffffffff81746431>] tcp_write_err+0x31/0x50
[<ffffffff81746bc9>] tcp_retransmit_timer+0x119/0x7d0
[<ffffffff81747460>] ? tcp_write_timer_handler+0x1e0/0x1e0
[<ffffffff8174730e>] tcp_write_timer_handler+0x8e/0x1e0
[<ffffffff817474c7>] tcp_write_timer+0x67/0x70
[<ffffffff810ccc35>] call_timer_fn+0x35/0x120
[<ffffffff81747460>] ? tcp_write_timer_handler+0x1e0/0x1e0
[<ffffffff810cd01c>] run_timer_softirq+0x1fc/0x2b0
[<ffffffff817e811c>] __do_softirq+0xcc/0x26c
[<ffffffff817e753c>] do_softirq_own_stack+0x1c/0x30 <EOI>
[<ffffffff8107b481>] do_softirq+0x31/0x40
[<ffffffff8107b508>] __local_bh_enable_ip+0x78/0x80
[<ffffffff817e572a>] _raw_spin_unlock_bh+0x1a/0x20
[<ffffffff81730a61>] inet_csk_get_port+0x1c1/0x5a0
[<ffffffff816c7637>] ? sock_poll+0x47/0xb0
[<ffffffff817313f5>] inet_csk_listen_start+0x65/0xc0
[<ffffffff8175ea8c>] inet_listen+0x9c/0xe0
[<ffffffff816c8560>] SyS_listen+0x80/0x90
[<ffffffff817e5adb>] entry_SYSCALL_64_fastpath+0x13/0x8f
Code: c1 ea 0c 83 e8 01 83 e2 30 48 98 48 81 c2 40 5e 01 00 48 03 14
c5 c0 d4 d1 81 4c 89 02 41 8b 40 08 85 c0 75 0a f3 90 41 8b 40 08 <85>
c0 74 f6 4d 8b 08 4d 85 c9 74 08 41 0f 0d 09 eb 02 f3 90 8b
^ permalink raw reply
* Re: commit : ppp: add rtnetlink device creation support - breaks netcf on my machine.
From: Guillaume Nault @ 2016-12-06 23:08 UTC (permalink / raw)
To: Brad Campbell; +Cc: netdev, Thomas Graf, David Miller
In-Reply-To: <5d537b7e-97e9-709c-7b3e-61280cc264f8@fnarfbargle.com>
(Cc Thomas and David)
On Tue, Dec 06, 2016 at 03:47:20PM +0800, Brad Campbell wrote:
> On 06/12/16 01:53, Guillaume Nault wrote:
> > >
> > Probably not a mistake on your side. I've started looking at netcf'
> > source code, but haven't found anything that could explain your issue.
> > It'd really help if you could provide steps to reproduce the bug.
>
> Further to my message this morning, I started with a clean linux.git
> 4.9.0-rc7-00198-g0cb65c8 and did two runs. One untouched and one with the
> identified patch reverted. I logged both of these with NLCB=debug, then
> split out the ppp section and diffed them.
>
> It appears the only difference of note is the new ATTR 18. I did a diff of
> the entire dump for both and nothing else popped out.
>
Thanks for the detailed report. Things are getting clear now.
>
> brad@test:~$ diff -u ppp-ok ppp-fail
> --- ppp-ok 2016-12-06 13:32:04.358393578 +0800
> +++ ppp-fail 2016-12-06 13:32:18.577864406 +0800
> @@ -1,10 +1,10 @@
> -------------------------- BEGIN NETLINK MESSAGE
> ---------------------------
> [HEADER] 16 octets
> - .nlmsg_len = 628
> + .nlmsg_len = 644
> .nlmsg_type = 16 <route/link::new>
> .nlmsg_flags = 2 <MULTI>
> - .nlmsg_seq = 1481001940
> - .nlmsg_pid = 7462
> + .nlmsg_seq = 1481002252
> + .nlmsg_pid = 7376
> [PAYLOAD] 16 octets
> 00 00 00 02 0a 00 00 00 d1 10 01 00 00 00 00 00 ................
> [ATTR 03] 5 octets
> @@ -71,6 +71,8 @@
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> ..................
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> ..................
> 00 00 00 00 00 00 ......
> + [ATTR 18] 12 octets
> + 08 00 01 00 70 70 70 00 04 00 02 00 ....ppp.....
> [ATTR 26] 132 octets
> 84 00 02 00 80 00 01 00 01 00 00 00 00 00 00 00 00 00
> ..................
> 00 00 01 00 00 00 01 00 00 00 01 00 00 00 01 00 00 00
> ..................
> @@ -81,3 +83,4 @@
> 00 00 00 00 10 27 00 00 e8 03 00 00 00 00 00 00 00 00
> .....'............
> 00 00 00 00 00 00 ......
> --------------------------- END NETLINK MESSAGE
> ---------------------------
>
'ATTR 18' is the IFLA_LINKINFO attribute. It contains two sub-attributes:
* IFLA_INFO_KIND ('08 00 01 00 70 70 70 00'), containing the "ppp"
string,
* and IFLA_INFO_DATA ('04 00 02 00') which has no payload because,
currently, ppp has no device specific data to return.
> Running with NLDBG=4 seems to generate this :
> DBG<2>: While picking up for 0x26d2e00 <route/link>, recvmsgs() returned
> -34: (errno = Numerical result out of range)DBG<1>: Clearing cache
> 0x26d2e00 <route/link>...
>
libnl1 rejects the IFLA_INFO_DATA attribute because it expects it to
contain a sub-attribute. Since the payload size is zero it doesn't
match the policy and parsing fails.
There's no problem with libnl3 because its policy accepts empty
payloads for NLA_NESTED attributes (see libnl3 commit 4be02ace4826 "Be
liberal when receiving an empty nested attribute").
I think empty nested attributes make perfect sense. At least we accept
them from user space since commit ea5693ccc553 ("netlink: allow empty
nested attributes"), so it should be fine to generate some from the
kernel.
OTOH, since some user space programs broke because of this, it might be
better to always add attributes in the .fill_info() callbacks, to work
around libnl1's policy. David, Thomas, do you have any opinion on this?
^ permalink raw reply
* Re: commit : ppp: add rtnetlink device creation support - breaks netcf on my machine.
From: Dan Williams @ 2016-12-06 23:12 UTC (permalink / raw)
To: Guillaume Nault, Brad Campbell; +Cc: netdev, Thomas Graf, David Miller, thaller
In-Reply-To: <20161206230853.ukyg75cyxugdwg4a@alphalink.fr>
On Wed, 2016-12-07 at 00:08 +0100, Guillaume Nault wrote:
> (Cc Thomas and David)
CC Thomas Haller who is the current libnl maintainer...
Dan
> On Tue, Dec 06, 2016 at 03:47:20PM +0800, Brad Campbell wrote:
> >
> > On 06/12/16 01:53, Guillaume Nault wrote:
> > >
> > > >
> > > >
> > > Probably not a mistake on your side. I've started looking at
> > > netcf'
> > > source code, but haven't found anything that could explain your
> > > issue.
> > > It'd really help if you could provide steps to reproduce the bug.
> >
> > Further to my message this morning, I started with a clean
> > linux.git
> > 4.9.0-rc7-00198-g0cb65c8 and did two runs. One untouched and one
> > with the
> > identified patch reverted. I logged both of these with NLCB=debug,
> > then
> > split out the ppp section and diffed them.
> >
> > It appears the only difference of note is the new ATTR 18. I did a
> > diff of
> > the entire dump for both and nothing else popped out.
> >
> Thanks for the detailed report. Things are getting clear now.
>
> >
> >
> > brad@test:~$ diff -u ppp-ok ppp-fail
> > --- ppp-ok 2016-12-06 13:32:04.358393578 +0800
> > +++ ppp-fail 2016-12-06 13:32:18.577864406 +0800
> > @@ -1,10 +1,10 @@
> > -------------------------- BEGIN NETLINK MESSAGE
> > ---------------------------
> > [HEADER] 16 octets
> > - .nlmsg_len = 628
> > + .nlmsg_len = 644
> > .nlmsg_type = 16 <route/link::new>
> > .nlmsg_flags = 2 <MULTI>
> > - .nlmsg_seq = 1481001940
> > - .nlmsg_pid = 7462
> > + .nlmsg_seq = 1481002252
> > + .nlmsg_pid = 7376
> > [PAYLOAD] 16 octets
> > 00 00 00 02 0a 00 00 00 d1 10 01 00 00 00 00
> > 00 ................
> > [ATTR 03] 5 octets
> > @@ -71,6 +71,8 @@
> > 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > ..................
> > 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > ..................
> > 00 00 00 00 00 00 ......
> > + [ATTR 18] 12 octets
> > + 08 00 01 00 70 70 70 00 04 00 02
> > 00 ....ppp.....
> > [ATTR 26] 132 octets
> > 84 00 02 00 80 00 01 00 01 00 00 00 00 00 00 00 00 00
> > ..................
> > 00 00 01 00 00 00 01 00 00 00 01 00 00 00 01 00 00 00
> > ..................
> > @@ -81,3 +83,4 @@
> > 00 00 00 00 10 27 00 00 e8 03 00 00 00 00 00 00 00 00
> > .....'............
> > 00 00 00 00 00 00 ......
> > --------------------------- END NETLINK MESSAGE
> > ---------------------------
> >
> 'ATTR 18' is the IFLA_LINKINFO attribute. It contains two sub-
> attributes:
> * IFLA_INFO_KIND ('08 00 01 00 70 70 70 00'), containing the "ppp"
> string,
> * and IFLA_INFO_DATA ('04 00 02 00') which has no payload because,
> currently, ppp has no device specific data to return.
>
> >
> > Running with NLDBG=4 seems to generate this :
> > DBG<2>: While picking up for 0x26d2e00 <route/link>, recvmsgs()
> > returned
> > -34: (errno = Numerical result out of range)DBG<1>: Clearing cache
> > 0x26d2e00 <route/link>...
> >
> libnl1 rejects the IFLA_INFO_DATA attribute because it expects it to
> contain a sub-attribute. Since the payload size is zero it doesn't
> match the policy and parsing fails.
>
> There's no problem with libnl3 because its policy accepts empty
> payloads for NLA_NESTED attributes (see libnl3 commit 4be02ace4826
> "Be
> liberal when receiving an empty nested attribute").
>
> I think empty nested attributes make perfect sense. At least we
> accept
> them from user space since commit ea5693ccc553 ("netlink: allow empty
> nested attributes"), so it should be fine to generate some from the
> kernel.
> OTOH, since some user space programs broke because of this, it might
> be
> better to always add attributes in the .fill_info() callbacks, to
> work
> around libnl1's policy. David, Thomas, do you have any opinion on
> this?
^ permalink raw reply
* ixgbe Port cannot load, "failed to register GSI"
From: Ben Greear @ 2016-12-06 23:22 UTC (permalink / raw)
To: netdev
We put 3 10-g dual-port ixgbe NICs and 4 4-port I350 NICs in a 2U rackmount, and one of the ixgbe ports
fails to come up. This previously worked before reboot, so maybe it is a race somehow. Kernel is 4.4.11+,
but no hacks to ixgbe or I350 drivers.
Anyone know if there is some sort of way to make this work reliably?
dmesg | grep ixgbe
[ 5.803307] ixgbe: Intel(R) 10 Gigabit PCI Express Network Driver - version 4.2.1-k
[ 5.803309] ixgbe: Copyright (c) 1999-2015 Intel Corporation.
[ 5.952119] ixgbe 0000:04:00.0: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[ 5.952245] ixgbe 0000:04:00.0: PCI Express bandwidth of 32GT/s available
[ 5.952246] ixgbe 0000:04:00.0: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[ 5.952328] ixgbe 0000:04:00.0: MAC: 2, PHY: 15, SFP+: 5, PBA No: FFFFFF-0FF
[ 5.952330] ixgbe 0000:04:00.0: 00:e0:ed:77:09:16
[ 5.954004] ixgbe 0000:04:00.0: Intel(R) 10 Gigabit Network Connection
[ 6.102346] ixgbe 0000:04:00.1: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[ 6.102475] ixgbe 0000:04:00.1: PCI Express bandwidth of 32GT/s available
[ 6.102478] ixgbe 0000:04:00.1: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[ 6.102562] ixgbe 0000:04:00.1: MAC: 2, PHY: 15, SFP+: 6, PBA No: FFFFFF-0FF
[ 6.102564] ixgbe 0000:04:00.1: 00:e0:ed:77:09:17
[ 6.104869] ixgbe 0000:04:00.1: Intel(R) 10 Gigabit Network Connection
[ 6.253429] ixgbe 0000:05:00.0: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[ 6.253558] ixgbe 0000:05:00.0: PCI Express bandwidth of 32GT/s available
[ 6.253560] ixgbe 0000:05:00.0: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[ 6.253644] ixgbe 0000:05:00.0: MAC: 2, PHY: 15, SFP+: 5, PBA No: FFFFFF-0FF
[ 6.253646] ixgbe 0000:05:00.0: 00:e0:ed:79:06:50
[ 6.255855] ixgbe 0000:05:00.0: Intel(R) 10 Gigabit Network Connection
[ 6.404128] ixgbe 0000:05:00.1: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[ 6.404254] ixgbe 0000:05:00.1: PCI Express bandwidth of 32GT/s available
[ 6.404255] ixgbe 0000:05:00.1: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[ 6.404337] ixgbe 0000:05:00.1: MAC: 2, PHY: 15, SFP+: 6, PBA No: FFFFFF-0FF
[ 6.404339] ixgbe 0000:05:00.1: 00:e0:ed:79:06:51
[ 6.405914] ixgbe 0000:05:00.1: Intel(R) 10 Gigabit Network Connection
[ 6.554373] ixgbe 0000:06:00.0: Multiqueue Enabled: Rx Queue count = 8, Tx Queue count = 8
[ 6.554501] ixgbe 0000:06:00.0: PCI Express bandwidth of 32GT/s available
[ 6.554504] ixgbe 0000:06:00.0: (Speed:5.0GT/s, Width: x8, Encoding Loss:20%)
[ 6.554588] ixgbe 0000:06:00.0: MAC: 2, PHY: 15, SFP+: 5, PBA No: FFFFFF-0FF
[ 6.554590] ixgbe 0000:06:00.0: 00:e0:ed:79:06:56
[ 6.556994] ixgbe 0000:06:00.0: Intel(R) 10 Gigabit Network Connection
[ 6.557160] ixgbe 0000:06:00.1: PCI INT B: failed to register GSI
[ 6.557169] ixgbe: probe of 0000:06:00.1 failed with error -28
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply
* Re: [PATCH net-next 2/7] bnxt_en: Enable MSIX early in bnxt_init_one().
From: kbuild test robot @ 2016-12-06 23:55 UTC (permalink / raw)
To: Michael Chan
Cc: kbuild-all-JC7UmRfGjtg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
netdev-u79uwXL29TY76Z2rM5mHXA,
selvin.xavier-dY08KVG/lbpWk0Htik3J/w,
somnath.kotur-dY08KVG/lbpWk0Htik3J/w,
dledford-H+wXaHxf7aLQT0dZR+AlfA,
linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1481044178-25193-3-git-send-email-michael.chan-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
[-- Attachment #1: Type: text/plain, Size: 8301 bytes --]
Hi Michael,
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/Michael-Chan/bnxt_en-Add-interface-to-support-RDMA-driver/20161207-053721
config: i386-randconfig-h1-12070631 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All warnings (new ones prefixed by >>):
drivers/net/ethernet/broadcom/bnxt/bnxt.c: In function 'bnxt_get_max_func_irqs':
>> drivers/net/ethernet/broadcom/bnxt/bnxt.c:4818:1: warning: control reaches end of non-void function [-Wreturn-type]
}
^
Cyclomatic Complexity 5 include/linux/compiler.h:__read_once_size
Cyclomatic Complexity 5 include/linux/compiler.h:__write_once_size
Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:set_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:__set_bit
Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:clear_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:__clear_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:test_and_set_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:test_and_clear_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:fls
Cyclomatic Complexity 1 include/linux/log2.h:__ilog2_u32
Cyclomatic Complexity 1 include/linux/list.h:INIT_LIST_HEAD
Cyclomatic Complexity 1 include/asm-generic/getorder.h:__get_order
Cyclomatic Complexity 1 include/linux/err.h:ERR_PTR
Cyclomatic Complexity 1 arch/x86/include/asm/irqflags.h:arch_irqs_disabled_flags
Cyclomatic Complexity 1 arch/x86/include/asm/processor.h:prefetch
Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_read
Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_set
Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_inc
Cyclomatic Complexity 1 arch/x86/include/asm/atomic.h:atomic_cmpxchg
Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_add
Cyclomatic Complexity 1 include/linux/bottom_half.h:__local_bh_disable_ip
Cyclomatic Complexity 1 include/linux/bottom_half.h:local_bh_disable
Cyclomatic Complexity 1 include/linux/spinlock.h:spinlock_check
Cyclomatic Complexity 1 include/linux/spinlock.h:spin_lock
Cyclomatic Complexity 1 include/linux/spinlock.h:spin_lock_bh
Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock
Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock_bh
Cyclomatic Complexity 1 include/linux/workqueue.h:__init_work
Cyclomatic Complexity 1 arch/x86/include/asm/topology.h:numa_node_id
Cyclomatic Complexity 1 include/linux/topology.h:numa_mem_id
Cyclomatic Complexity 1 include/linux/gfp.h:gfp_zonelist
Cyclomatic Complexity 1 include/linux/gfp.h:node_zonelist
Cyclomatic Complexity 1 include/linux/kasan.h:kasan_kmalloc
Cyclomatic Complexity 28 include/linux/slab.h:kmalloc_index
Cyclomatic Complexity 1 include/linux/slab.h:kmem_cache_alloc_trace
Cyclomatic Complexity 1 include/linux/slab.h:kmalloc_order_trace
Cyclomatic Complexity 67 include/linux/slab.h:kmalloc_large
Cyclomatic Complexity 5 include/linux/slab.h:kmalloc
Cyclomatic Complexity 1 include/linux/slab.h:kzalloc
Cyclomatic Complexity 1 arch/x86/include/asm/io.h:readl
Cyclomatic Complexity 1 arch/x86/include/asm/io.h:writel
Cyclomatic Complexity 1 include/linux/device.h:dev_get_drvdata
Cyclomatic Complexity 1 include/linux/device.h:dev_set_drvdata
Cyclomatic Complexity 1 include/linux/pci.h:pci_is_bridge
Cyclomatic Complexity 1 include/linux/mm.h:lowmem_page_address
Cyclomatic Complexity 1 include/linux/mm.h:page_is_pfmemalloc
Cyclomatic Complexity 1 include/linux/pci.h:pci_disable_msix
Cyclomatic Complexity 1 include/linux/pci.h:pci_enable_msix_range
Cyclomatic Complexity 1 include/linux/pci.h:pci_get_drvdata
Cyclomatic Complexity 1 include/linux/pci.h:pci_set_drvdata
Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_map_page
Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_mapping_error
Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_unmap_page
Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_alloc_coherent
Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_free_coherent
Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_sync_single_for_cpu
Cyclomatic Complexity 1 include/linux/dma-debug.h:debug_dma_sync_single_for_device
Cyclomatic Complexity 1 include/linux/kmemcheck.h:kmemcheck_mark_initialized
Cyclomatic Complexity 1 include/linux/dma-mapping.h:valid_dma_direction
Cyclomatic Complexity 1 arch/x86/include/asm/dma-mapping.h:get_dma_ops
Cyclomatic Complexity 2 include/linux/dma-mapping.h:dma_mapping_error
Cyclomatic Complexity 1 include/linux/dynamic_queue_limits.h:dql_avail
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_frag_size
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_frag_size_set
Cyclomatic Complexity 1 include/linux/skbuff.h:__skb_set_hash
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_set_hash
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_end_pointer
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_headlen
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_reserve
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_transport_header
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_network_header
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_transport_header
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_transport_offset
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_network_header_len
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_inner_network_offset
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_frag_page
Cyclomatic Complexity 1 include/linux/skbuff.h:__skb_frag_set_page
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_get_queue_mapping
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_record_rx_queue
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_is_gso
Cyclomatic Complexity 1 include/linux/skbuff.h:skb_checksum_none_assert
Cyclomatic Complexity 1 include/linux/netdevice.h:napi_disable_pending
Cyclomatic Complexity 3 include/linux/netdevice.h:napi_schedule_prep
Cyclomatic Complexity 1 include/linux/netdevice.h:netdev_get_num_tc
Cyclomatic Complexity 1 include/linux/netdevice.h:netdev_get_tx_queue
Cyclomatic Complexity 1 include/linux/netdevice.h:netdev_priv
Cyclomatic Complexity 1 include/linux/netdevice.h:netif_tx_stop_queue
Cyclomatic Complexity 1 include/linux/netdevice.h:netif_tx_queue_stopped
Cyclomatic Complexity 1 include/linux/netdevice.h:netif_running
Cyclomatic Complexity 1 include/linux/netdevice.h:netif_carrier_ok
Cyclomatic Complexity 1 include/linux/netdevice.h:__netif_tx_lock
Cyclomatic Complexity 1 include/linux/netdevice.h:__netif_tx_unlock
Cyclomatic Complexity 1 include/linux/netdevice.h:netif_addr_lock_bh
Cyclomatic Complexity 1 include/linux/netdevice.h:netif_addr_unlock_bh
Cyclomatic Complexity 1 include/linux/etherdevice.h:is_zero_ether_addr
Cyclomatic Complexity 1 include/linux/etherdevice.h:is_multicast_ether_addr
vim +4818 drivers/net/ethernet/broadcom/bnxt/bnxt.c
4802 if (bp->flags & BNXT_FLAG_USING_MSIX)
4803 bnxt_setup_msix(bp);
4804 else
4805 bnxt_setup_inta(bp);
4806
4807 rc = bnxt_set_real_num_queues(bp);
4808 return rc;
4809 }
4810
4811 static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
4812 {
4813 if (BNXT_PF(bp))
4814 return bp->pf.max_irqs;
4815 #if defined(CONFIG_BNXT_SRIOV)
4816 return bp->vf.max_irqs;
4817 #endif
> 4818 }
4819
4820 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
4821 {
4822 if (BNXT_PF(bp))
4823 bp->pf.max_irqs = max_irqs;
4824 #if defined(CONFIG_BNXT_SRIOV)
4825 else
4826 bp->vf.max_irqs = max_irqs;
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28906 bytes --]
^ permalink raw reply
* [PATCH v5 00/13] net: ethernet: ti: cpts: update and fixes
From: Grygorii Strashko @ 2016-12-07 0:00 UTC (permalink / raw)
To: David S. Miller, netdev, Mugunthan V N, Richard Cochran
Cc: Sekhar Nori, linux-kernel, linux-omap, devicetree,
Murali Karicheri, Wingman Kwok, Thomas Gleixner,
Grygorii Strashko
This is a preparation series intended to clean up and optimize the TI CPTS driver to
facilitate further integration with other TI SoCs like Keystone 2.
Changes in v5:
- fixed copy paste error in cpts_release
- reworked cc.mult/shift and cc_mult initialization
Changes in v4:
- fixed build error in patch
"net: ethernet: ti: cpts: clean up event list if event pool is empty"
- rebased on top of net-next
Changes in v3:
- patches reordered: fixes and small updates moved first
- added comments in code about cpts->cc_mult
- conversion range (maxsec) limited to 10 sec
Changes in v2:
- patch "net: ethernet: ti: cpts: rework initialization/deinitialization"
was split into 4 patches
- applied comments from Richard Cochran
- dropped patch
"net: ethernet: ti: cpts: add return value to tx and rx timestamp funcitons"
- new patches added:
"net: ethernet: ti: cpts: drop excessive writes to CTRL and INT_EN regs"
and "clocksource: export the clocks_calc_mult_shift to use by timestamp code"
Links to previous versions:
v4: https://lkml.org/lkml/2016/12/6/496
v3: https://www.spinics.net/lists/devicetree/msg153474.html
v2: http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1282034.html
v1: http://www.spinics.net/lists/linux-omap/msg131925.html
Grygorii Strashko (11):
net: ethernet: ti: cpts: switch to readl/writel_relaxed()
net: ethernet: ti: allow cpts to be built separately
net: ethernet: ti: cpsw: minimize direct access to struct cpts
net: ethernet: ti: cpts: fix unbalanced clk api usage in cpts_register/unregister
net: ethernet: ti: cpts: fix registration order
net: ethernet: ti: cpts: disable cpts when unregistered
net: ethernet: ti: cpts: drop excessive writes to CTRL and INT_EN regs
net: ethernet: ti: cpts: rework initialization/deinitialization
net: ethernet: ti: cpts: move dt props parsing to cpts driver
net: ethernet: ti: cpts: calc mult and shift from refclk freq
net: ethernet: ti: cpts: fix overflow check period
Murali Karicheri (1):
clocksource: export the clocks_calc_mult_shift to use by timestamp code
WingMan Kwok (1):
net: ethernet: ti: cpts: clean up event list if event pool is empty
Documentation/devicetree/bindings/net/cpsw.txt | 8 +-
drivers/net/ethernet/ti/Kconfig | 2 +-
drivers/net/ethernet/ti/Makefile | 3 +-
drivers/net/ethernet/ti/cpsw.c | 84 ++++-----
drivers/net/ethernet/ti/cpsw.h | 2 -
drivers/net/ethernet/ti/cpts.c | 233 ++++++++++++++++++-------
drivers/net/ethernet/ti/cpts.h | 80 ++++++++-
kernel/time/clocksource.c | 1 +
8 files changed, 297 insertions(+), 116 deletions(-)
--
2.10.1
^ permalink raw reply
* [PATCH v5 01/13] net: ethernet: ti: cpts: switch to readl/writel_relaxed()
From: Grygorii Strashko @ 2016-12-07 0:00 UTC (permalink / raw)
To: David S. Miller, netdev, Mugunthan V N, Richard Cochran
Cc: Sekhar Nori, linux-kernel, linux-omap, devicetree,
Murali Karicheri, Wingman Kwok, Thomas Gleixner,
Grygorii Strashko
In-Reply-To: <20161207000045.28333-1-grygorii.strashko@ti.com>
Switch to the readl/writel_relaxed() APIs, because these are the recommended
APIs and the CPTS IP is reused on Keystone 2 SoCs,
where LE/BE modes are supported.
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
---
drivers/net/ethernet/ti/cpts.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index 85a55b4..a42c449 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -33,8 +33,8 @@
#ifdef CONFIG_TI_CPTS
-#define cpts_read32(c, r) __raw_readl(&c->reg->r)
-#define cpts_write32(c, v, r) __raw_writel(v, &c->reg->r)
+#define cpts_read32(c, r) readl_relaxed(&c->reg->r)
+#define cpts_write32(c, v, r) writel_relaxed(v, &c->reg->r)
static int event_expired(struct cpts_event *event)
{
--
2.10.1
^ permalink raw reply related
* [PATCH v5 02/13] net: ethernet: ti: allow cpts to be built separately
From: Grygorii Strashko @ 2016-12-07 0:00 UTC (permalink / raw)
To: David S. Miller, netdev, Mugunthan V N, Richard Cochran
Cc: Sekhar Nori, linux-kernel, linux-omap, devicetree,
Murali Karicheri, Wingman Kwok, Thomas Gleixner,
Grygorii Strashko
In-Reply-To: <20161207000045.28333-1-grygorii.strashko@ti.com>
The TI CPTS IP is used as part of the TI OMAP CPSW driver, but it's also
present as part of NETCP on TI Keystone 2 SoCs. So, it's required
to enable the build of CPTS for both of these drivers, and this can be
achieved by allowing CPTS to be built separately.
Hence, allow cpts to be built separately and convert it to be
a module, as both the CPSW and NETCP drivers can be built as modules.
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
drivers/net/ethernet/ti/Kconfig | 2 +-
drivers/net/ethernet/ti/Makefile | 3 ++-
drivers/net/ethernet/ti/cpsw.c | 22 +++++++++++++++++-----
drivers/net/ethernet/ti/cpts.c | 16 ++++++++--------
drivers/net/ethernet/ti/cpts.h | 18 ++++++++++++++----
5 files changed, 42 insertions(+), 19 deletions(-)
diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index 9904d74..ff7f518 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -74,7 +74,7 @@ config TI_CPSW
will be called cpsw.
config TI_CPTS
- bool "TI Common Platform Time Sync (CPTS) Support"
+ tristate "TI Common Platform Time Sync (CPTS) Support"
depends on TI_CPSW
select PTP_1588_CLOCK
---help---
diff --git a/drivers/net/ethernet/ti/Makefile b/drivers/net/ethernet/ti/Makefile
index d420d94..1e7c10b 100644
--- a/drivers/net/ethernet/ti/Makefile
+++ b/drivers/net/ethernet/ti/Makefile
@@ -12,8 +12,9 @@ obj-$(CONFIG_TI_DAVINCI_MDIO) += davinci_mdio.o
obj-$(CONFIG_TI_DAVINCI_CPDMA) += davinci_cpdma.o
obj-$(CONFIG_TI_CPSW_PHY_SEL) += cpsw-phy-sel.o
obj-$(CONFIG_TI_CPSW_ALE) += cpsw_ale.o
+obj-$(CONFIG_TI_CPTS) += cpts.o
obj-$(CONFIG_TI_CPSW) += ti_cpsw.o
-ti_cpsw-y := cpsw.o cpts.o
+ti_cpsw-y := cpsw.o
obj-$(CONFIG_TI_KEYSTONE_NETCP) += keystone_netcp.o
keystone_netcp-y := netcp_core.o
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index f373a4b..8fdb274 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1594,7 +1594,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
return NETDEV_TX_BUSY;
}
-#ifdef CONFIG_TI_CPTS
+#if IS_ENABLED(CONFIG_TI_CPTS)
static void cpsw_hwtstamp_v1(struct cpsw_common *cpsw)
{
@@ -1742,7 +1742,16 @@ static int cpsw_hwtstamp_get(struct net_device *dev, struct ifreq *ifr)
return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
}
+#else
+static int cpsw_hwtstamp_get(struct net_device *dev, struct ifreq *ifr)
+{
+ return -EOPNOTSUPP;
+}
+static int cpsw_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
+{
+ return -EOPNOTSUPP;
+}
#endif /*CONFIG_TI_CPTS*/
static int cpsw_ndo_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
@@ -1755,12 +1764,10 @@ static int cpsw_ndo_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
return -EINVAL;
switch (cmd) {
-#ifdef CONFIG_TI_CPTS
case SIOCSHWTSTAMP:
return cpsw_hwtstamp_set(dev, req);
case SIOCGHWTSTAMP:
return cpsw_hwtstamp_get(dev, req);
-#endif
}
if (!cpsw->slaves[slave_no].phy)
@@ -2100,10 +2107,10 @@ static void cpsw_set_msglevel(struct net_device *ndev, u32 value)
priv->msg_enable = value;
}
+#if IS_ENABLED(CONFIG_TI_CPTS)
static int cpsw_get_ts_info(struct net_device *ndev,
struct ethtool_ts_info *info)
{
-#ifdef CONFIG_TI_CPTS
struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
info->so_timestamping =
@@ -2120,7 +2127,12 @@ static int cpsw_get_ts_info(struct net_device *ndev,
info->rx_filters =
(1 << HWTSTAMP_FILTER_NONE) |
(1 << HWTSTAMP_FILTER_PTP_V2_EVENT);
+ return 0;
+}
#else
+static int cpsw_get_ts_info(struct net_device *ndev,
+ struct ethtool_ts_info *info)
+{
info->so_timestamping =
SOF_TIMESTAMPING_TX_SOFTWARE |
SOF_TIMESTAMPING_RX_SOFTWARE |
@@ -2128,9 +2140,9 @@ static int cpsw_get_ts_info(struct net_device *ndev,
info->phc_index = -1;
info->tx_types = 0;
info->rx_filters = 0;
-#endif
return 0;
}
+#endif
static int cpsw_get_link_ksettings(struct net_device *ndev,
struct ethtool_link_ksettings *ecmd)
diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index a42c449..8cb0369 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -31,8 +31,6 @@
#include "cpts.h"
-#ifdef CONFIG_TI_CPTS
-
#define cpts_read32(c, r) readl_relaxed(&c->reg->r)
#define cpts_write32(c, v, r) writel_relaxed(v, &c->reg->r)
@@ -334,6 +332,7 @@ void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb)
memset(ssh, 0, sizeof(*ssh));
ssh->hwtstamp = ns_to_ktime(ns);
}
+EXPORT_SYMBOL_GPL(cpts_rx_timestamp);
void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb)
{
@@ -349,13 +348,11 @@ void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb)
ssh.hwtstamp = ns_to_ktime(ns);
skb_tstamp_tx(skb, &ssh);
}
-
-#endif /*CONFIG_TI_CPTS*/
+EXPORT_SYMBOL_GPL(cpts_tx_timestamp);
int cpts_register(struct device *dev, struct cpts *cpts,
u32 mult, u32 shift)
{
-#ifdef CONFIG_TI_CPTS
int err, i;
unsigned long flags;
@@ -391,18 +388,21 @@ int cpts_register(struct device *dev, struct cpts *cpts,
schedule_delayed_work(&cpts->overflow_work, CPTS_OVERFLOW_PERIOD);
cpts->phc_index = ptp_clock_index(cpts->clock);
-#endif
return 0;
}
+EXPORT_SYMBOL_GPL(cpts_register);
void cpts_unregister(struct cpts *cpts)
{
-#ifdef CONFIG_TI_CPTS
if (cpts->clock) {
ptp_clock_unregister(cpts->clock);
cancel_delayed_work_sync(&cpts->overflow_work);
}
if (cpts->refclk)
cpts_clk_release(cpts);
-#endif
}
+EXPORT_SYMBOL_GPL(cpts_unregister);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("TI CPTS driver");
+MODULE_AUTHOR("Richard Cochran <richardcochran@gmail.com>");
diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h
index 69a46b9..416ba2c 100644
--- a/drivers/net/ethernet/ti/cpts.h
+++ b/drivers/net/ethernet/ti/cpts.h
@@ -111,7 +111,7 @@ struct cpts {
struct cpsw_cpts __iomem *reg;
int tx_enable;
int rx_enable;
-#ifdef CONFIG_TI_CPTS
+#if IS_ENABLED(CONFIG_TI_CPTS)
struct ptp_clock_info info;
struct ptp_clock *clock;
spinlock_t lock; /* protects time registers */
@@ -127,9 +127,11 @@ struct cpts {
#endif
};
-#ifdef CONFIG_TI_CPTS
+#if IS_ENABLED(CONFIG_TI_CPTS)
void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb);
void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb);
+int cpts_register(struct device *dev, struct cpts *cpts, u32 mult, u32 shift);
+void cpts_unregister(struct cpts *cpts);
#else
static inline void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb)
{
@@ -137,9 +139,17 @@ static inline void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb)
static inline void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb)
{
}
+
+static inline int
+cpts_register(struct device *dev, struct cpts *cpts, u32 mult, u32 shift)
+{
+ return 0;
+}
+
+static inline void cpts_unregister(struct cpts *cpts)
+{
+}
#endif
-int cpts_register(struct device *dev, struct cpts *cpts, u32 mult, u32 shift);
-void cpts_unregister(struct cpts *cpts);
#endif
--
2.10.1
^ permalink raw reply related
* [PATCH v5 03/13] net: ethernet: ti: cpsw: minimize direct access to struct cpts
From: Grygorii Strashko @ 2016-12-07 0:00 UTC (permalink / raw)
To: David S. Miller, netdev, Mugunthan V N, Richard Cochran
Cc: Sekhar Nori, linux-kernel, linux-omap, devicetree,
Murali Karicheri, Wingman Kwok, Thomas Gleixner,
Grygorii Strashko
In-Reply-To: <20161207000045.28333-1-grygorii.strashko@ti.com>
This will provide more flexibility in changing CPTS internals and is also
required for further changes.
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
drivers/net/ethernet/ti/cpsw.c | 28 +++++++++++++++-------------
drivers/net/ethernet/ti/cpts.h | 39 +++++++++++++++++++++++++++++++++++++++
2 files changed, 54 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 8fdb274..7599895 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1562,7 +1562,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
}
if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
- cpsw->cpts->tx_enable)
+ cpts_is_tx_enabled(cpsw->cpts))
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
skb_tx_timestamp(skb);
@@ -1601,7 +1601,8 @@ static void cpsw_hwtstamp_v1(struct cpsw_common *cpsw)
struct cpsw_slave *slave = &cpsw->slaves[cpsw->data.active_slave];
u32 ts_en, seq_id;
- if (!cpsw->cpts->tx_enable && !cpsw->cpts->rx_enable) {
+ if (!cpts_is_tx_enabled(cpsw->cpts) &&
+ !cpts_is_rx_enabled(cpsw->cpts)) {
slave_write(slave, 0, CPSW1_TS_CTL);
return;
}
@@ -1609,10 +1610,10 @@ static void cpsw_hwtstamp_v1(struct cpsw_common *cpsw)
seq_id = (30 << CPSW_V1_SEQ_ID_OFS_SHIFT) | ETH_P_1588;
ts_en = EVENT_MSG_BITS << CPSW_V1_MSG_TYPE_OFS;
- if (cpsw->cpts->tx_enable)
+ if (cpts_is_tx_enabled(cpsw->cpts))
ts_en |= CPSW_V1_TS_TX_EN;
- if (cpsw->cpts->rx_enable)
+ if (cpts_is_rx_enabled(cpsw->cpts))
ts_en |= CPSW_V1_TS_RX_EN;
slave_write(slave, ts_en, CPSW1_TS_CTL);
@@ -1635,20 +1636,20 @@ static void cpsw_hwtstamp_v2(struct cpsw_priv *priv)
case CPSW_VERSION_2:
ctrl &= ~CTRL_V2_ALL_TS_MASK;
- if (cpsw->cpts->tx_enable)
+ if (cpts_is_tx_enabled(cpsw->cpts))
ctrl |= CTRL_V2_TX_TS_BITS;
- if (cpsw->cpts->rx_enable)
+ if (cpts_is_rx_enabled(cpsw->cpts))
ctrl |= CTRL_V2_RX_TS_BITS;
break;
case CPSW_VERSION_3:
default:
ctrl &= ~CTRL_V3_ALL_TS_MASK;
- if (cpsw->cpts->tx_enable)
+ if (cpts_is_tx_enabled(cpsw->cpts))
ctrl |= CTRL_V3_TX_TS_BITS;
- if (cpsw->cpts->rx_enable)
+ if (cpts_is_rx_enabled(cpsw->cpts))
ctrl |= CTRL_V3_RX_TS_BITS;
break;
}
@@ -1684,7 +1685,7 @@ static int cpsw_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
switch (cfg.rx_filter) {
case HWTSTAMP_FILTER_NONE:
- cpts->rx_enable = 0;
+ cpts_rx_enable(cpts, 0);
break;
case HWTSTAMP_FILTER_ALL:
case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
@@ -1700,14 +1701,14 @@ static int cpsw_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
case HWTSTAMP_FILTER_PTP_V2_EVENT:
case HWTSTAMP_FILTER_PTP_V2_SYNC:
case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
- cpts->rx_enable = 1;
+ cpts_rx_enable(cpts, 1);
cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
break;
default:
return -ERANGE;
}
- cpts->tx_enable = cfg.tx_type == HWTSTAMP_TX_ON;
+ cpts_tx_enable(cpts, cfg.tx_type == HWTSTAMP_TX_ON);
switch (cpsw->version) {
case CPSW_VERSION_1:
@@ -1736,8 +1737,9 @@ static int cpsw_hwtstamp_get(struct net_device *dev, struct ifreq *ifr)
return -EOPNOTSUPP;
cfg.flags = 0;
- cfg.tx_type = cpts->tx_enable ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
- cfg.rx_filter = (cpts->rx_enable ?
+ cfg.tx_type = cpts_is_tx_enabled(cpts) ?
+ HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
+ cfg.rx_filter = (cpts_is_rx_enabled(cpts) ?
HWTSTAMP_FILTER_PTP_V2_EVENT : HWTSTAMP_FILTER_NONE);
return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h
index 416ba2c..29a1e80c 100644
--- a/drivers/net/ethernet/ti/cpts.h
+++ b/drivers/net/ethernet/ti/cpts.h
@@ -132,6 +132,27 @@ void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb);
void cpts_tx_timestamp(struct cpts *cpts, struct sk_buff *skb);
int cpts_register(struct device *dev, struct cpts *cpts, u32 mult, u32 shift);
void cpts_unregister(struct cpts *cpts);
+
+static inline void cpts_rx_enable(struct cpts *cpts, int enable)
+{
+ cpts->rx_enable = enable;
+}
+
+static inline bool cpts_is_rx_enabled(struct cpts *cpts)
+{
+ return !!cpts->rx_enable;
+}
+
+static inline void cpts_tx_enable(struct cpts *cpts, int enable)
+{
+ cpts->tx_enable = enable;
+}
+
+static inline bool cpts_is_tx_enabled(struct cpts *cpts)
+{
+ return !!cpts->tx_enable;
+}
+
#else
static inline void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb)
{
@@ -149,6 +170,24 @@ cpts_register(struct device *dev, struct cpts *cpts, u32 mult, u32 shift)
static inline void cpts_unregister(struct cpts *cpts)
{
}
+
+static inline void cpts_rx_enable(struct cpts *cpts, int enable)
+{
+}
+
+static inline bool cpts_is_rx_enabled(struct cpts *cpts)
+{
+ return false;
+}
+
+static inline void cpts_tx_enable(struct cpts *cpts, int enable)
+{
+}
+
+static inline bool cpts_is_tx_enabled(struct cpts *cpts)
+{
+ return false;
+}
#endif
--
2.10.1
^ permalink raw reply related
* [PATCH v5 04/13] net: ethernet: ti: cpts: fix unbalanced clk api usage in cpts_register/unregister
From: Grygorii Strashko @ 2016-12-07 0:00 UTC (permalink / raw)
To: David S. Miller, netdev, Mugunthan V N, Richard Cochran
Cc: Sekhar Nori, linux-kernel, linux-omap, devicetree,
Murali Karicheri, Wingman Kwok, Thomas Gleixner,
Grygorii Strashko
In-Reply-To: <20161207000045.28333-1-grygorii.strashko@ti.com>
There are two issues with the TI CPTS code which are reproducible when a TI
CPSW ethX device goes through a few up/down iterations:
- the cpts refclk prepare counter is continuously incremented after each
up/down iteration;
- devm_clk_get(dev, "cpts") is called many times.
Hence, fix these issues by using clk_disable_unprepare() in
cpts_clk_release() and skipping devm_clk_get() if cpts refclk has been
acquired already.
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
---
drivers/net/ethernet/ti/cpts.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index 8cb0369..61198f1 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -230,18 +230,20 @@ static void cpts_overflow_check(struct work_struct *work)
static void cpts_clk_init(struct device *dev, struct cpts *cpts)
{
- cpts->refclk = devm_clk_get(dev, "cpts");
- if (IS_ERR(cpts->refclk)) {
- dev_err(dev, "Failed to get cpts refclk\n");
- cpts->refclk = NULL;
- return;
+ if (!cpts->refclk) {
+ cpts->refclk = devm_clk_get(dev, "cpts");
+ if (IS_ERR(cpts->refclk)) {
+ dev_err(dev, "Failed to get cpts refclk\n");
+ cpts->refclk = NULL;
+ return;
+ }
}
clk_prepare_enable(cpts->refclk);
}
static void cpts_clk_release(struct cpts *cpts)
{
- clk_disable(cpts->refclk);
+ clk_disable_unprepare(cpts->refclk);
}
static int cpts_match(struct sk_buff *skb, unsigned int ptp_class,
--
2.10.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox