All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jason Wang <jasowang@redhat.com>
To: sfeldma@gmail.com, qemu-devel@nongnu.org, jiri@resnulli.us,
	roopa@cumulusnetworks.com, john.fastabend@gmail.com,
	eblake@redhat.com, pbonzini@redhat.com, stefanha@gmail.com
Subject: Re: [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device
Date: Fri, 16 Jan 2015 17:15:52 +0800	[thread overview]
Message-ID: <54B8D6C8.6090405@redhat.com> (raw)
In-Reply-To: <1420948672-50103-7-git-send-email-sfeldma@gmail.com>


On 01/11/2015 11:57 AM, sfeldma@gmail.com wrote:
> From: Scott Feldman <sfeldma@gmail.com>
>
> Rocker is a simulated ethernet switch device.  The device supports up to 62
> front-panel ports and supports L2 switching and L3 routing functions, as well
> as L2/L3/L4 ACLs.  The device presents a single PCI device for each switch,
> with a memory-mapped register space for device driver access.
>
> Rocker device is invoked with -device, for example a 4-port switch:
>
>   -device rocker,name=sw1,len-ports=4,ports[0]=dev0,ports[1]=dev1, \
>          ports[2]=dev2,ports[3]=dev3
>
> Each port is a netdev and can be paired with using -netdev id=<port name>.
>
> Signed-off-by: Scott Feldman <sfeldma@gmail.com>
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> ---

Looks like the devices does not support state saving. How about tagging
it with unmigratable first?
>  default-configs/pci.mak       |    1 +
>  hw/net/Makefile.objs          |    3 +
>  hw/net/rocker/rocker.c        | 1395 +++++++++++++++++++++++++
>  hw/net/rocker/rocker.h        |   76 ++
>  hw/net/rocker/rocker_desc.c   |  379 +++++++
>  hw/net/rocker/rocker_desc.h   |   57 +
>  hw/net/rocker/rocker_fp.c     |  232 +++++
>  hw/net/rocker/rocker_fp.h     |   53 +
>  hw/net/rocker/rocker_hw.h     |  475 +++++++++
>  hw/net/rocker/rocker_of_dpa.c | 2314 +++++++++++++++++++++++++++++++++++++++++
>  hw/net/rocker/rocker_of_dpa.h |   25 +
>  hw/net/rocker/rocker_tlv.h    |  244 +++++
>  hw/net/rocker/rocker_world.c  |  108 ++
>  hw/net/rocker/rocker_world.h  |   63 ++
>  14 files changed, 5425 insertions(+)
>  create mode 100644 hw/net/rocker/rocker.c
>  create mode 100644 hw/net/rocker/rocker.h
>  create mode 100644 hw/net/rocker/rocker_desc.c
>  create mode 100644 hw/net/rocker/rocker_desc.h
>  create mode 100644 hw/net/rocker/rocker_fp.c
>  create mode 100644 hw/net/rocker/rocker_fp.h
>  create mode 100644 hw/net/rocker/rocker_hw.h
>  create mode 100644 hw/net/rocker/rocker_of_dpa.c
>  create mode 100644 hw/net/rocker/rocker_of_dpa.h
>  create mode 100644 hw/net/rocker/rocker_tlv.h
>  create mode 100644 hw/net/rocker/rocker_world.c
>  create mode 100644 hw/net/rocker/rocker_world.h
[...]
> +
> +typedef struct rocker {
> +    /* private */
> +    PCIDevice parent_obj;
> +    /* public */
> +
> +    MemoryRegion mmio;
> +    MemoryRegion msix_bar;
> +
> +    /* switch configuration */
> +    char *name;                  /* switch name */
> +    uint32_t fp_ports;           /* front-panel port count */
> +    NICPeers *fp_ports_peers;
> +    MACAddr fp_start_macaddr;    /* front-panel port 0 mac addr */
> +    uint64_t switch_id;          /* switch id */

Looks like the function of name ad switch_id are duplicated?
> +
> +    /* front-panel ports */
> +    FpPort *fp_port[ROCKER_FP_PORTS_MAX];
> +
> +    /* register backings */
> +    uint32_t test_reg;
> +    uint64_t test_reg64;
> +    dma_addr_t test_dma_addr;
> +    uint32_t test_dma_size;
> +
> +    /* desc rings */
> +    DescRing **rings;
> +
> +    /* switch worlds */
> +    World *worlds[ROCKER_WORLD_TYPE_MAX];
> +    World *world_dflt;
> +
> +    QLIST_ENTRY(rocker) next;
> +} Rocker;
> +
[...]
> +
> +static int tx_consume(Rocker *r, DescInfo *info)
> +{
> +    PCIDevice *dev = PCI_DEVICE(r);
> +    char *buf = desc_get_buf(info, true);
> +    RockerTlv *tlv_frag;
> +    RockerTlv *tlvs[ROCKER_TLV_TX_MAX + 1];
> +    struct iovec iov[ROCKER_TX_FRAGS_MAX] = { { 0, }, };
> +    uint32_t pport;
> +    uint32_t port;
> +    uint16_t tx_offload = ROCKER_TX_OFFLOAD_NONE;
> +    uint16_t tx_l3_csum_off = 0;
> +    uint16_t tx_tso_mss = 0;
> +    uint16_t tx_tso_hdr_len = 0;
> +    int iovcnt = 0;
> +    int err = 0;
> +    int rem;
> +    int i;
> +
> +    if (!buf) {
> +        return -ENXIO;
> +    }
> +
> +    rocker_tlv_parse(tlvs, ROCKER_TLV_TX_MAX, buf, desc_tlv_size(info));
> +
> +    if (!tlvs[ROCKER_TLV_TX_FRAGS]) {
> +        return -EINVAL;
> +    }
> +
> +    pport = rocker_get_pport_by_tx_ring(r, desc_get_ring(info));
> +    if (!fp_port_from_pport(pport, &port)) {
> +        return -EINVAL;
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_OFFLOAD]) {
> +        tx_offload = rocker_tlv_get_u8(tlvs[ROCKER_TLV_TX_OFFLOAD]);
> +    }
> +
> +    switch (tx_offload) {
> +    case ROCKER_TX_OFFLOAD_L3_CSUM:
> +        if (!tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> +            return -EINVAL;
> +        }
> +    case ROCKER_TX_OFFLOAD_TSO:
> +        if (!tlvs[ROCKER_TLV_TX_TSO_MSS] ||
> +            !tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> +            return -EINVAL;
> +        }
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) {
> +        tx_l3_csum_off = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]);
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_TSO_MSS]) {
> +        tx_tso_mss = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_MSS]);
> +    }
> +
> +    if (tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) {
> +        tx_tso_hdr_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]);
> +    }
> +
> +    rocker_tlv_for_each_nested(tlv_frag, tlvs[ROCKER_TLV_TX_FRAGS], rem) {
> +        hwaddr frag_addr;
> +        uint16_t frag_len;
> +
> +        if (rocker_tlv_type(tlv_frag) != ROCKER_TLV_TX_FRAG) {
> +            return -EINVAL;
> +        }
> +
> +        rocker_tlv_parse_nested(tlvs, ROCKER_TLV_TX_FRAG_ATTR_MAX, tlv_frag);
> +
> +        if (!tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR] ||
> +            !tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]) {
> +            return -EINVAL;
> +        }
> +
> +        frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_TX_FRAG_ATTR_ADDR]);
> +        frag_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_TX_FRAG_ATTR_LEN]);
> +
> +        iov[iovcnt].iov_len = frag_len;
> +        iov[iovcnt].iov_base = g_malloc(frag_len);
> +        if (!iov[iovcnt].iov_base) {
> +            err = -ENOMEM;
> +            goto err_no_mem;
> +        }
> +
> +        if (pci_dma_read(dev, frag_addr, iov[iovcnt].iov_base,
> +                     iov[iovcnt].iov_len)) {
> +            err = -ENXIO;
> +            goto err_bad_io;
> +        }
> +
> +        if (++iovcnt > ROCKER_TX_FRAGS_MAX) {
> +            goto err_too_many_frags;
> +        }
> +    }
> +
> +    if (iovcnt) {
> +        /* XXX perform Tx offloads */
> +        /* XXX   silence compiler for now */

Need add TSO here (e1000 is a good reference) or disable TSO in driver.
> +        tx_l3_csum_off += tx_tso_mss = tx_tso_hdr_len = 0;
> +    }
> +
> +    err = fp_port_eg(r->fp_port[port], iov, iovcnt);
> +
> +err_no_mem:
> +err_bad_io:
> +err_too_many_frags:
> +    for (i = 0; i < iovcnt; i++) {
> +        if (iov[i].iov_base) {
> +            g_free(iov[i].iov_base);
> +        }
> +    }
> +
> +    return err;
> +}
> +

[...]
> +
> +int rocker_event_link_changed(Rocker *r, uint32_t pport, bool link_up)
> +{
> +    DescRing *ring = r->rings[ROCKER_RING_EVENT];
> +    DescInfo *info = desc_ring_fetch_desc(ring);
> +    RockerTlv *nest;
> +    char *buf;
> +    size_t tlv_size;
> +    int pos;
> +    int err;
> +
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) +  /* event type */
> +               rocker_tlv_total_size(0) +                 /* nest */
> +               rocker_tlv_total_size(sizeof(uint32_t)) +  /*   pport */
> +               rocker_tlv_total_size(sizeof(uint8_t));    /*   link up */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto err_too_big;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENOMEM;
> +        goto err_no_mem;
> +    }
> +
> +    pos = 0;
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> +                        ROCKER_TLV_EVENT_TYPE_LINK_CHANGED);
> +    nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_PPORT, pport);
> +    rocker_tlv_put_u8(buf, &pos, ROCKER_TLV_EVENT_LINK_CHANGED_LINKUP,
> +                      link_up ? 1 : 0);

Looks like those types are not documented.
> +    rocker_tlv_nest_end(buf, &pos, nest);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> +    }
> +
> +    return err;
> +}
> +
> +int rocker_event_mac_vlan_seen(Rocker *r, uint32_t pport, uint8_t *addr,
> +                               uint16_t vlan_id)
> +{
> +    DescRing *ring = r->rings[ROCKER_RING_EVENT];
> +    DescInfo *info;
> +    FpPort *fp_port;
> +    uint32_t port;
> +    RockerTlv *nest;
> +    char *buf;
> +    size_t tlv_size;
> +    int pos;
> +    int err;
> +
> +    if (!fp_port_from_pport(pport, &port)) {
> +        return -EINVAL;
> +    }
> +    fp_port = r->fp_port[port];
> +    if (!fp_port_get_learning(fp_port)) {
> +        return 0;
> +    }
> +
> +    info = desc_ring_fetch_desc(ring);
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) +  /* event type */
> +               rocker_tlv_total_size(0) +                 /* nest */
> +               rocker_tlv_total_size(sizeof(uint32_t)) +  /*   pport */
> +               rocker_tlv_total_size(ETH_ALEN) +          /*   mac addr */
> +               rocker_tlv_total_size(sizeof(uint16_t));   /*   vlan_id */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto err_too_big;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENOMEM;
> +        goto err_no_mem;
> +    }
> +
> +    pos = 0;
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_TYPE,
> +                        ROCKER_TLV_EVENT_TYPE_MAC_VLAN_SEEN);
> +    nest = rocker_tlv_nest_start(buf, &pos, ROCKER_TLV_EVENT_INFO);
> +    rocker_tlv_put_le32(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_PPORT, pport);
> +    rocker_tlv_put(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_MAC, ETH_ALEN, addr);
> +    rocker_tlv_put_u16(buf, &pos, ROCKER_TLV_EVENT_MAC_VLAN_VLAN_ID, vlan_id);

Undocumented types.
> +    rocker_tlv_nest_end(buf, &pos, nest);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +err_too_big:
> +err_no_mem:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_EVENT);
> +    }
> +
> +    return err;
> +}
> +
> +static DescRing *rocker_get_rx_ring_by_pport(Rocker *r,
> +                                                     uint32_t pport)
> +{
> +    return r->rings[(pport - 1) * 2 + 3];
> +}
> +
> +int rx_produce(World *world, uint32_t pport,
> +               const struct iovec *iov, int iovcnt)
> +{
> +    Rocker *r = world_rocker(world);
> +    PCIDevice *dev = (PCIDevice *)r;
> +    DescRing *ring = rocker_get_rx_ring_by_pport(r, pport);
> +    DescInfo *info = desc_ring_fetch_desc(ring);
> +    char *data;
> +    size_t data_size = iov_size(iov, iovcnt);
> +    char *buf;
> +    uint16_t rx_flags = 0;
> +    uint16_t rx_csum = 0;
> +    size_t tlv_size;
> +    RockerTlv *tlvs[ROCKER_TLV_RX_MAX + 1];
> +    hwaddr frag_addr;
> +    uint16_t frag_max_len;
> +    int pos;
> +    int err;
> +
> +    if (!info) {
> +        return -ENOBUFS;
> +    }
> +
> +    buf = desc_get_buf(info, false);
> +    if (!buf) {
> +        err = -ENXIO;
> +        goto out;
> +    }
> +    rocker_tlv_parse(tlvs, ROCKER_TLV_RX_MAX, buf, desc_tlv_size(info));
> +
> +    if (!tlvs[ROCKER_TLV_RX_FRAG_ADDR] ||
> +        !tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]) {

This conflicts with the spec which only describes RX_PACKET.
> +        err = -EINVAL;
> +        goto out;
> +    }
> +
> +    frag_addr = rocker_tlv_get_le64(tlvs[ROCKER_TLV_RX_FRAG_ADDR]);
> +    frag_max_len = rocker_tlv_get_le16(tlvs[ROCKER_TLV_RX_FRAG_MAX_LEN]);
> +
> +    if (data_size > frag_max_len) {
> +        err = -EMSGSIZE;
> +        goto out;
> +    }
> +
> +    /* XXX calc rx flags/csum */
> +
> +    tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) + /* flags */
> +               rocker_tlv_total_size(sizeof(uint16_t)) + /* scum */
> +               rocker_tlv_total_size(sizeof(uint64_t)) + /* frag addr */
> +               rocker_tlv_total_size(sizeof(uint16_t)) + /* frag max len */
> +               rocker_tlv_total_size(sizeof(uint16_t));  /* frag len */
> +
> +    if (tlv_size > desc_buf_size(info)) {
> +        err = -EMSGSIZE;
> +        goto out;
> +    }
> +
> +    /* TODO:
> +     * iov dma write can be optimized in similar way e1000 does it in
> +     * e1000_receive_iov. But maybe if would make sense to introduce
> +     * generic helper iov_dma_write.
> +     */
> +
> +    data = g_malloc(data_size);
> +    if (!data) {
> +        err = -ENOMEM;
> +        goto out;
> +    }
> +    iov_to_buf(iov, iovcnt, 0, data, data_size);
> +    pci_dma_write(dev, frag_addr, data, data_size);
> +    g_free(data);
> +
> +    pos = 0;
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FLAGS, rx_flags);
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_CSUM, rx_csum);

Note: Some backend (e.g tap) can do offloading, may consider to add the
support in the future.
> +    rocker_tlv_put_le64(buf, &pos, ROCKER_TLV_RX_FRAG_ADDR, frag_addr);
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_MAX_LEN, frag_max_len);

The above 2 types were not documented. Please make sure all types were
documented.
> +    rocker_tlv_put_le16(buf, &pos, ROCKER_TLV_RX_FRAG_LEN, data_size);
> +
> +    err = desc_set_buf(info, tlv_size);
> +
> +out:
> +    if (desc_ring_post_desc(ring, err)) {
> +        rocker_msix_irq(r, ROCKER_MSIX_VEC_RX(pport - 1));
> +    }
> +
> +    return err;
> +}
> +
[...]
> +
> +bool fp_port_from_pport(uint32_t pport, uint32_t *port)
> +{
> +    if (pport < 1 || pport > ROCKER_FP_PORTS_MAX) {
> +        return false;
> +    }
> +    *port = pport - 1;
> +    return true;
> +}
> +
> +int fp_port_eg(FpPort *port, const struct iovec *iov, int iovcnt)
> +{
> +    NetClientState *nc = qemu_get_queue(port->nic);

Using only 1 queues is ok for multiqueue backend. In the future may
consider to use all.
> +
> +    if (port->enabled) {
> +        qemu_sendv_packet(nc, iov, iovcnt);
> +    }
> +
> +    return 0;
> +}
> +
[...]

Thanks

  parent reply	other threads:[~2015-01-16  9:16 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-01-11  3:57 [Qemu-devel] [PATCH v3 0/9] rocker: add new rocker ethernet switch device sfeldma
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 1/9] net: add MAC address string printer sfeldma
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 2/9] virtio-net: use qemu_mac_strdup_printf sfeldma
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 3/9] rocker: add register programming guide sfeldma
2015-01-12 11:40   ` Paolo Bonzini
2015-01-16  8:14     ` Scott Feldman
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 4/9] pci: add rocker device ID sfeldma
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 5/9] pci: add network device class 'other' for network switches sfeldma
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 6/9] rocker: add new rocker switch device sfeldma
2015-01-12 12:57   ` Paolo Bonzini
2015-01-14 23:20     ` Scott Feldman
2015-01-15  9:08       ` Paolo Bonzini
2015-01-16  9:15   ` Jason Wang [this message]
2015-01-16  9:48     ` Scott Feldman
2015-01-21  3:36       ` Jason Wang
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 7/9] qmp: add rocker device support sfeldma
2015-01-16  9:26   ` Jason Wang
2015-01-16  9:59     ` Scott Feldman
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 8/9] rocker: add tests sfeldma
2015-01-11  3:57 ` [Qemu-devel] [PATCH v3 9/9] MAINTAINERS: add rocker sfeldma

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=54B8D6C8.6090405@redhat.com \
    --to=jasowang@redhat.com \
    --cc=eblake@redhat.com \
    --cc=jiri@resnulli.us \
    --cc=john.fastabend@gmail.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=roopa@cumulusnetworks.com \
    --cc=sfeldma@gmail.com \
    --cc=stefanha@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.