All of lore.kernel.org
 help / color / mirror / Atom feed
From: Thomas Graf <tgraf-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
To: pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org,
	dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org,
	dev-VfR2kkLFssw@public.gmane.org,
	dpdk-ovs-y27Ovi1pjclAfugRpC6u6w@public.gmane.org
Cc: Gerald Rogers <gerald.rogers-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Subject: Re: [ovs-dev] [PATCH RFC] dpif-netdev: Add support Intel DPDK based ports.
Date: Wed, 29 Jan 2014 11:01:32 +0100	[thread overview]
Message-ID: <52E8D17C.8050100@redhat.com> (raw)
In-Reply-To: <1390873715-26714-1-git-send-email-pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>

On 01/28/2014 02:48 AM, pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org wrote:
> From: Pravin B Shelar <pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
>
> Following patch adds DPDK netdev-class to userspace datapath.
> Approach taken in this patch differs from Intel® DPDK vSwitch
> where DPDK datapath switching is done in saparate process.  This
> patch adds support for DPDK type port and uses OVS userspace
> datapath for switching.  Therefore all DPDK processing and flow
> miss handling is done in single process.  This also avoids code
> duplication by reusing OVS userspace datapath switching and
> therefore it supports all flow matching and actions that
> user-space datapath supports.  Refer to INSTALL.DPDK doc for
> further info.
>
> With this patch I got similar performance for netperf TCP_STREAM
> tests compared to kernel datapath.
>
> This is based a patch from Gerald Rogers.
>
> Signed-off-by: Pravin B Shelar <pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
> CC: "Gerald Rogers" <gerald.rogers-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Pravin,

Some initial comments below. I will provide more after deeper
digging.

Do you have any ideas on how to implement the TX batching yet?

> +
> +static int
> +netdev_dpdk_rx_drain(struct netdev_rx *rx_)
> +{
> +    struct netdev_rx_dpdk *rx = netdev_rx_dpdk_cast(rx_);
> +    int pending;
> +    int i;
> +
> +    pending = rx->ofpbuf_cnt;
> +    if (pending) {

This conditional seems unneeded.

> +        for (i = 0; i < pending; i++) {
> +             build_ofpbuf(rx, &rx->ofpbuf[i], NULL);
> +        }
> +        rx->ofpbuf_cnt = 0;
> +        return 0;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Tx function. Transmit packets indefinitely */
> +static int
> +dpdk_do_tx_copy(struct netdev *netdev, char *buf, int size)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    struct rte_mbuf *pkt;
> +    uint32_t nb_tx = 0;
> +
> +    pkt = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
> +    if (!pkt) {
> +        return 0;

Silent drop? ;-) Shouldn't these drops be accounted for somehow?

> +    }
> +
> +    /* We have to do a copy for now */
> +    memcpy(pkt->pkt.data, buf, size);
> +
> +    rte_pktmbuf_data_len(pkt) = size;
> +    rte_pktmbuf_pkt_len(pkt) = size;
> +
> +    rte_spinlock_lock(&dev->tx_lock);

What is the purpose of tx_lock here? Multiple threads writing to
the same Q? The lock is not acquired for the zerocopy path below.

> +    nb_tx = rte_eth_tx_burst(dev->port_id, NR_QUEUE, &pkt, 1);
> +    rte_spinlock_unlock(&dev->tx_lock);
> +
> +    if (nb_tx != 1) {
> +        /* free buffers if we couldn't transmit packets */
> +        rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> +    }
> +    return nb_tx;
> +}
> +
> +static int
> +netdev_dpdk_send(struct netdev *netdev,
> +                 struct ofpbuf *ofpbuf, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    if (ofpbuf->size > dev->max_packet_len) {
> +        VLOG_ERR("2big size %d max_packet_len %d",
> +                  (int)ofpbuf->size , dev->max_packet_len);

Should probably use VLOG_RATE_LIMIT_INIT

> +        return E2BIG;
> +    }
> +
> +    rte_prefetch0(&ofpbuf->private_p);
> +    if (!may_steal ||
> +        !ofpbuf->private_p || ofpbuf->source != OFPBUF_DPDK) {
> +        dpdk_do_tx_copy(netdev, (char *) ofpbuf->data, ofpbuf->size);
> +    } else {
> +        struct rte_mbuf *pkt;
> +        uint32_t nb_tx;
> +        int qid;
> +
> +        pkt = ofpbuf->private_p;
> +        ofpbuf->private_p = NULL;
> +        rte_pktmbuf_data_len(pkt) = ofpbuf->size;
> +        rte_pktmbuf_pkt_len(pkt) = ofpbuf->size;
> +
> +        /* TODO: TX batching. */
> +        qid = rte_lcore_id() % NR_QUEUE;
> +        nb_tx = rte_eth_tx_burst(dev->port_id, qid, &pkt, 1);
> +        if (nb_tx != 1) {
> +            struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +            rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> +            VLOG_ERR("TX error, zero packets sent");

Same here

> +       }
> +    }
> +    return 0;
> +}

> +static int
> +netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int old_mtu, err;
> +    struct dpdk_mp *old_mp;
> +    struct dpdk_mp *mp;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    ovs_mutex_lock(&dev->mutex);
> +    if (dev->mtu == mtu) {
> +        err = 0;
> +        goto out;
> +    }
> +
> +    mp = dpdk_mp_get(dev->socket_id, dev->mtu);
> +    if (!mp) {
> +        err = ENOMEM;
> +        goto out;
> +    }
> +
> +    rte_eth_dev_stop(dev->port_id);
> +
> +    old_mtu = dev->mtu;
> +    old_mp = dev->dpdk_mp;
> +    dev->dpdk_mp = mp;
> +    dev->mtu = mtu;
> +    dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> +
> +    err = dpdk_eth_dev_init(dev);
> +    if (err) {
> +
> +        dpdk_mp_put(mp);
> +        dev->mtu = old_mtu;
> +        dev->dpdk_mp = old_mp;
> +        dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> +        dpdk_eth_dev_init(dev);

Would be nice if we don't need these constructs and DPDK would
provide an all or nothing init method.

> +        goto out;
> +    }
> +
> +    dpdk_mp_put(old_mp);
> +out:
> +    ovs_mutex_unlock(&dev->mutex);
> +    ovs_mutex_unlock(&dpdk_mutex);
> +    return err;
> +}
> +

> +static int
> +netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
> +                           enum netdev_flags off, enum netdev_flags on,
> +                           enum netdev_flags *old_flagsp)
> +    OVS_REQUIRES(dev->mutex)
> +{
> +    int err;
> +
> +    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
> +        return EINVAL;
> +    }
> +
> +    *old_flagsp = dev->flags;
> +    dev->flags |= on;
> +    dev->flags &= ~off;
> +
> +    if (dev->flags == *old_flagsp) {
> +        return 0;
> +    }
> +
> +    rte_eth_dev_stop(dev->port_id);
> +
> +    if (dev->flags & NETDEV_UP) {
> +        err = rte_eth_dev_start(dev->port_id);
> +        if (err)
> +            return err;
> +    }

I'm not a DPDK expert but is it required to restart the device
to change promisc settings or could we conditionally start and
stop based on the previous flags state?

> +
> +    if (dev->flags & NETDEV_PROMISC) {
> +        rte_eth_promiscuous_enable(dev->port_id);
> +        rte_eth_allmulticast_enable(dev->port_id);
> +    }
> +
> +    return 0;
> +}
>
> +
> +static void
> +netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
> +                            const char *argv[], void *aux OVS_UNUSED)
> +{
> +    bool up;
> +
> +    if (!strcasecmp(argv[argc - 1], "up")) {
> +        up = true;
> +    } else if ( !strcasecmp(argv[argc - 1], "down")) {
> +        up = false;
> +    } else {
> +        unixctl_command_reply_error(conn, "Invalid Admin State");
> +        return;
> +    }
> +
> +    if (argc > 2) {
> +        struct netdev *netdev = netdev_from_name(argv[1]);

For future refinement: Usability would be increased if either a
strict one interface argument is enforced or multiple interface
names could be passed in, e.g. set-admin-state dpdk0 dpdk1 up
or set-admin-state dpdk0 up dpdk1 up

As of now, dpdk1 is silently ignored which is not nice.

> +        if (netdev && is_dpdk_class(netdev->netdev_class)) {
> +            struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
> +
> +            ovs_mutex_lock(&dpdk_dev->mutex);
> +            netdev_dpdk_set_admin_state__(dpdk_dev, up);
> +            ovs_mutex_unlock(&dpdk_dev->mutex);
> +
> +            netdev_close(netdev);
> +        } else {
> +            unixctl_command_reply_error(conn, "Unknown Dummy Interface");

I think this should read "Not a DPDK Interface" or something similar.


> +            netdev_close(netdev);
> +            return;
> +        }
> +    } else {
> +        struct netdev_dpdk *netdev;
> +
> +        ovs_mutex_lock(&dpdk_mutex);
> +        LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
> +            ovs_mutex_lock(&netdev->mutex);
> +            netdev_dpdk_set_admin_state__(netdev, up);
> +            ovs_mutex_unlock(&netdev->mutex);
> +        }
> +        ovs_mutex_unlock(&dpdk_mutex);
> +    }
> +    unixctl_command_reply(conn, "OK");
> +}
> +
> +
> -    retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
> -    if (!retval) {
> -        COVERAGE_INC(netdev_received);

Are you removing the netdev_receive counter on purpose here?

  parent reply	other threads:[~2014-01-29 10:01 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-01-28  1:48 [PATCH RFC] dpif-netdev: Add support Intel DPDK based ports pshelar-l0M0P4e3n4LQT0dZR+AlfA
     [not found] ` <1390873715-26714-1-git-send-email-pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
2014-01-28  4:49   ` Ben Pfaff
     [not found]     ` <20140128044950.GA4545-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
2014-01-28  5:28       ` [ovs-dev] " Pravin Shelar
     [not found]         ` <CALnjE+pFMa86Uz_9LZiZ3p-VzJwazQ18kwfZ62m6=2LZqQA7tA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2014-01-28 14:47           ` Vincent JARDIN
     [not found]             ` <52E7C31A.7050409-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>
2014-01-28 15:54               ` Thomas Graf
     [not found]                 ` <52E7D2A8.400-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2014-01-28 18:20                   ` Pravin Shelar
2014-01-28 17:56               ` Pravin Shelar
     [not found]                 ` <CALnjE+qT-zCany+1t53paHmwkycJsftbEex6Q=OfEqQPNOt5nA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2014-01-29  0:15                   ` Vincent JARDIN
     [not found]                     ` <52E8482D.10804-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>
2014-01-29 19:32                       ` Pravin Shelar
2014-01-28 15:48   ` Thomas Graf
     [not found]     ` <52E7D13B.9020404-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2014-01-28 18:17       ` [ovs-dev] " Pravin Shelar
     [not found]         ` <CALnjE+rP29s8mkiKPtppt-a8jMn-B2qS7+re2ZBd8bK46ozUPA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2014-01-29  8:15           ` Thomas Graf
     [not found]             ` <52E8B88A.1070104-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2014-01-29 10:26               ` Vincent JARDIN
     [not found]                 ` <52E8D772.9070302-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>
2014-01-29 11:14                   ` Thomas Graf
     [not found]                     ` <52E8E2AB.1080600-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2014-01-29 16:34                       ` Vincent JARDIN
     [not found]                         ` <52E92DA6.9070704-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>
2014-01-29 17:14                           ` Thomas Graf
     [not found]                             ` <52E936D9.4010207-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2014-01-29 18:42                               ` Stephen Hemminger
2014-01-29 20:47                               ` François-Frédéric Ozog
2014-01-29 23:15                                 ` Thomas Graf
2014-03-13  7:37                                 ` David Nyström
2014-01-29  8:56   ` Prashant Upadhyaya
     [not found]     ` <C7CE7EEF248E2B48BBA63D0ABEEE700C5A2A24F762-2zbAqoMm/rLQX//ci7WS+53eMK7GYZcrXYFosVITYPE@public.gmane.org>
2014-01-29 21:29       ` Pravin Shelar
     [not found]         ` <CALnjE+rXdqzo2y9tKevsu7R5=kp-do+Yhv1Nr7Q9jKEC5Ror3A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2014-01-30 10:15           ` Prashant Upadhyaya
     [not found]             ` <C7CE7EEF248E2B48BBA63D0ABEEE700C5A2A24F97D-2zbAqoMm/rLQX//ci7WS+53eMK7GYZcrXYFosVITYPE@public.gmane.org>
2014-01-30 16:27               ` [dpdk-dev] " Rogers, Gerald
2014-01-29 10:01   ` Thomas Graf [this message]
     [not found]     ` <52E8D17C.8050100-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2014-01-29 21:49       ` [ovs-dev] " Pravin Shelar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=52E8D17C.8050100@redhat.com \
    --to=tgraf-h+wxahxf7alqt0dzr+alfa@public.gmane.org \
    --cc=dev-VfR2kkLFssw@public.gmane.org \
    --cc=dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org \
    --cc=dpdk-ovs-y27Ovi1pjclAfugRpC6u6w@public.gmane.org \
    --cc=gerald.rogers-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
    --cc=pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.