From: Thomas Graf
Subject: Re: [ovs-dev] [PATCH RFC] dpif-netdev: Add support Intel DPDK based ports.
Date: Wed, 29 Jan 2014 11:01:32 +0100
Message-ID: <52E8D17C.8050100@redhat.com>
References: <1390873715-26714-1-git-send-email-pshelar@nicira.com>
In-Reply-To: <1390873715-26714-1-git-send-email-pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
To: pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org, dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org,
    dev-VfR2kkLFssw@public.gmane.org, dpdk-ovs-y27Ovi1pjclAfugRpC6u6w@public.gmane.org
Cc: Gerald Rogers

On 01/28/2014 02:48 AM, pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org wrote:
> From: Pravin B Shelar
>
> The following patch adds a DPDK netdev class to the userspace datapath.
> The approach taken in this patch differs from Intel® DPDK vSwitch,
> where DPDK datapath switching is done in a separate process. This
> patch adds support for a DPDK port type and uses the OVS userspace
> datapath for switching, so all DPDK processing and flow miss
> handling is done in a single process. This also avoids code
> duplication by reusing the OVS userspace datapath switching, and it
> therefore supports all flow matching and actions that the
> userspace datapath supports. Refer to the INSTALL.DPDK doc for
> further info.
>
> With this patch I got performance similar to the kernel datapath
> in netperf TCP_STREAM tests.
>
> This is based on a patch from Gerald Rogers.
>
> Signed-off-by: Pravin B Shelar
> CC: "Gerald Rogers"

Pravin,

Some initial comments below. I will provide more after digging deeper.

Do you have any ideas yet on how to implement the TX batching?

> +
> +static int
> +netdev_dpdk_rx_drain(struct netdev_rx *rx_)
> +{
> +    struct netdev_rx_dpdk *rx = netdev_rx_dpdk_cast(rx_);
> +    int pending;
> +    int i;
> +
> +    pending = rx->ofpbuf_cnt;
> +    if (pending) {

This conditional seems unneeded.

> +        for (i = 0; i < pending; i++) {
> +            build_ofpbuf(rx, &rx->ofpbuf[i], NULL);
> +        }
> +        rx->ofpbuf_cnt = 0;
> +        return 0;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Tx function. Transmit packets indefinitely */
> +static int
> +dpdk_do_tx_copy(struct netdev *netdev, char *buf, int size)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    struct rte_mbuf *pkt;
> +    uint32_t nb_tx = 0;
> +
> +    pkt = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
> +    if (!pkt) {
> +        return 0;

Silent drop? ;-) Shouldn't these drops be accounted for somehow?

> +    }
> +
> +    /* We have to do a copy for now */
> +    memcpy(pkt->pkt.data, buf, size);
> +
> +    rte_pktmbuf_data_len(pkt) = size;
> +    rte_pktmbuf_pkt_len(pkt) = size;
> +
> +    rte_spinlock_lock(&dev->tx_lock);

What is the purpose of tx_lock here? Multiple threads writing to the
same Q? The lock is not acquired for the zerocopy path below.
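
Going back to the silent drop above, something along these lines is
what I had in mind; the 'tx_dropped' field is made up here, any
per-device counter that eventually shows up in the netdev stats
would do:

    pkt = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
    if (!pkt) {
        /* Sketch only: 'tx_dropped' is a hypothetical counter, not part
         * of this patch.  The point is that mbuf allocation failures
         * become visible in the device stats instead of vanishing
         * silently. */
        ovs_mutex_lock(&dev->mutex);
        dev->tx_dropped++;
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    }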
> +    nb_tx = rte_eth_tx_burst(dev->port_id, NR_QUEUE, &pkt, 1);
> +    rte_spinlock_unlock(&dev->tx_lock);
> +
> +    if (nb_tx != 1) {
> +        /* free buffers if we couldn't transmit packets */
> +        rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> +    }
> +    return nb_tx;
> +}
> +
> +static int
> +netdev_dpdk_send(struct netdev *netdev,
> +                 struct ofpbuf *ofpbuf, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    if (ofpbuf->size > dev->max_packet_len) {
> +        VLOG_ERR("2big size %d max_packet_len %d",
> +                 (int)ofpbuf->size , dev->max_packet_len);

This should probably be rate limited (VLOG_RATE_LIMIT_INIT).

> +        return E2BIG;
> +    }
> +
> +    rte_prefetch0(&ofpbuf->private_p);
> +    if (!may_steal ||
> +        !ofpbuf->private_p || ofpbuf->source != OFPBUF_DPDK) {
> +        dpdk_do_tx_copy(netdev, (char *) ofpbuf->data, ofpbuf->size);
> +    } else {
> +        struct rte_mbuf *pkt;
> +        uint32_t nb_tx;
> +        int qid;
> +
> +        pkt = ofpbuf->private_p;
> +        ofpbuf->private_p = NULL;
> +        rte_pktmbuf_data_len(pkt) = ofpbuf->size;
> +        rte_pktmbuf_pkt_len(pkt) = ofpbuf->size;
> +
> +        /* TODO: TX batching. */
> +        qid = rte_lcore_id() % NR_QUEUE;
> +        nb_tx = rte_eth_tx_burst(dev->port_id, qid, &pkt, 1);
> +        if (nb_tx != 1) {
> +            struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +            rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
> +            VLOG_ERR("TX error, zero packets sent");

Same here.

> +        }
> +    }
> +    return 0;
> +}

> +static int
> +netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int old_mtu, err;
> +    struct dpdk_mp *old_mp;
> +    struct dpdk_mp *mp;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    ovs_mutex_lock(&dev->mutex);
> +    if (dev->mtu == mtu) {
> +        err = 0;
> +        goto out;
> +    }
> +
> +    mp = dpdk_mp_get(dev->socket_id, dev->mtu);
> +    if (!mp) {
> +        err = ENOMEM;
> +        goto out;
> +    }
> +
> +    rte_eth_dev_stop(dev->port_id);
> +
> +    old_mtu = dev->mtu;
> +    old_mp = dev->dpdk_mp;
> +    dev->dpdk_mp = mp;
> +    dev->mtu = mtu;
> +    dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> +
> +    err = dpdk_eth_dev_init(dev);
> +    if (err) {
> +
> +        dpdk_mp_put(mp);
> +        dev->mtu = old_mtu;
> +        dev->dpdk_mp = old_mp;
> +        dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
> +        dpdk_eth_dev_init(dev);

It would be nice if we didn't need these constructs and DPDK provided
an all-or-nothing init method.

> +        goto out;
> +    }
> +
> +    dpdk_mp_put(old_mp);
> +out:
> +    ovs_mutex_unlock(&dev->mutex);
> +    ovs_mutex_unlock(&dpdk_mutex);
> +    return err;
> +}
> +
> +static int
> +netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
> +                           enum netdev_flags off, enum netdev_flags on,
> +                           enum netdev_flags *old_flagsp)
> +    OVS_REQUIRES(dev->mutex)
> +{
> +    int err;
> +
> +    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
> +        return EINVAL;
> +    }
> +
> +    *old_flagsp = dev->flags;
> +    dev->flags |= on;
> +    dev->flags &= ~off;
> +
> +    if (dev->flags == *old_flagsp) {
> +        return 0;
> +    }
> +
> +    rte_eth_dev_stop(dev->port_id);
> +
> +    if (dev->flags & NETDEV_UP) {
> +        err = rte_eth_dev_start(dev->port_id);
> +        if (err)
> +            return err;
> +    }

I'm not a DPDK expert, but is it required to restart the device to
change promisc settings, or could we conditionally start and stop
based on the previous flags state?
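
Just to illustrate what I mean, roughly along these lines (completely
untested, and I don't know whether all PMDs are happy with toggling
promisc/allmulti on a running port):

    /* Sketch only: restart the port only when NETDEV_UP actually changes,
     * and toggle promisc/allmulti in place for the other transitions. */
    if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
        if (dev->flags & NETDEV_UP) {
            err = rte_eth_dev_start(dev->port_id);
            if (err) {
                return err;
            }
        } else {
            rte_eth_dev_stop(dev->port_id);
        }
    }

    if ((dev->flags ^ *old_flagsp) & NETDEV_PROMISC) {
        if (dev->flags & NETDEV_PROMISC) {
            rte_eth_promiscuous_enable(dev->port_id);
            rte_eth_allmulticast_enable(dev->port_id);
        } else {
            rte_eth_promiscuous_disable(dev->port_id);
            rte_eth_allmulticast_disable(dev->port_id);
        }
    }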
> +
> +    if (dev->flags & NETDEV_PROMISC) {
> +        rte_eth_promiscuous_enable(dev->port_id);
> +        rte_eth_allmulticast_enable(dev->port_id);
> +    }
> +
> +    return 0;
> +}
>
> +
> +static void
> +netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
> +                            const char *argv[], void *aux OVS_UNUSED)
> +{
> +    bool up;
> +
> +    if (!strcasecmp(argv[argc - 1], "up")) {
> +        up = true;
> +    } else if ( !strcasecmp(argv[argc - 1], "down")) {
> +        up = false;
> +    } else {
> +        unixctl_command_reply_error(conn, "Invalid Admin State");
> +        return;
> +    }
> +
> +    if (argc > 2) {
> +        struct netdev *netdev = netdev_from_name(argv[1]);

For future refinement: usability would be improved if either a strict
single-interface argument were enforced or multiple interface names
could be passed in, e.g.

  set-admin-state dpdk0 dpdk1 up

or

  set-admin-state dpdk0 up dpdk1 up

As it stands, dpdk1 is silently ignored, which is not nice. (See the
sketch at the end of this mail.)

> +        if (netdev && is_dpdk_class(netdev->netdev_class)) {
> +            struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
> +
> +            ovs_mutex_lock(&dpdk_dev->mutex);
> +            netdev_dpdk_set_admin_state__(dpdk_dev, up);
> +            ovs_mutex_unlock(&dpdk_dev->mutex);
> +
> +            netdev_close(netdev);
> +        } else {
> +            unixctl_command_reply_error(conn, "Unknown Dummy Interface");

I think this should read "Not a DPDK Interface" or something similar.

> +            netdev_close(netdev);
> +            return;
> +        }
> +    } else {
> +        struct netdev_dpdk *netdev;
> +
> +        ovs_mutex_lock(&dpdk_mutex);
> +        LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
> +            ovs_mutex_lock(&netdev->mutex);
> +            netdev_dpdk_set_admin_state__(netdev, up);
> +            ovs_mutex_unlock(&netdev->mutex);
> +        }
> +        ovs_mutex_unlock(&dpdk_mutex);
> +    }
> +    unixctl_command_reply(conn, "OK");
> +}
> +
> +
> -    retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
> -    if (!retval) {
> -        COVERAGE_INC(netdev_received);

Are you removing the netdev_received counter on purpose here?
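
Finally, to illustrate the set-admin-state suggestion above, this is
roughly what I had in mind as a replacement for the argc > 2 branch of
netdev_dpdk_set_admin_state (untested sketch, argument validation
omitted):

    /* Sketch only: treat every argument before the final "up"/"down" as
     * an interface name and apply the state change to each of them. */
    int i;

    for (i = 1; i < argc - 1; i++) {
        struct netdev *netdev = netdev_from_name(argv[i]);

        if (netdev && is_dpdk_class(netdev->netdev_class)) {
            struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);

            ovs_mutex_lock(&dpdk_dev->mutex);
            netdev_dpdk_set_admin_state__(dpdk_dev, up);
            ovs_mutex_unlock(&dpdk_dev->mutex);
        } else {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            netdev_close(netdev);
            return;
        }
        netdev_close(netdev);
    }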