* Re: [net-next rfc v7 2/3] virtio_net: multiqueue support
From: Michael S. Tsirkin @ 2012-12-02 16:06 UTC (permalink / raw)
To: Jason Wang
Cc: rusty, krkumar2, virtualization, netdev, linux-kernel, kvm,
bhutchings, jwhan, shiyer
In-Reply-To: <1354011360-39479-3-git-send-email-jasowang@redhat.com>
On Tue, Nov 27, 2012 at 06:15:59PM +0800, Jason Wang wrote:
> This adds multiqueue support to the virtio_net driver. In multiple queue mode, the
> driver expects the number of queue pairs to be equal to the number of vcpus. To
> eliminate the contention between vcpus and virtqueues, per-cpu virtqueue pairs
> were implemented through:
>
> - select the txq based on the smp processor id.
> - smp affinity hint were set to the vcpu that owns the queue pairs.
>
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/virtio_net.c | 454 ++++++++++++++++++++++++++++++---------
> include/uapi/linux/virtio_net.h | 16 ++
> 2 files changed, 371 insertions(+), 99 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 7975133..bcaa6e5 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -84,17 +84,25 @@ struct virtnet_info {
> struct virtio_device *vdev;
> struct virtqueue *cvq;
> struct net_device *dev;
> - struct napi_struct napi;
> - struct send_queue sq;
> - struct receive_queue rq;
> + struct send_queue *sq;
> + struct receive_queue *rq;
> unsigned int status;
>
> + /* Max # of queue pairs supported by the device */
> + u16 max_queue_pairs;
> +
> + /* # of queue pairs currently used by the driver */
> + u16 curr_queue_pairs;
> +
> /* I like... big packets and I cannot lie! */
> bool big_packets;
>
> /* Host will merge rx buffers for big packets (shake it! shake it!) */
> bool mergeable_rx_bufs;
>
> + /* Has control virtqueue */
> + bool has_cvq;
> +
> /* enable config space updates */
> bool config_enable;
>
> @@ -126,6 +134,34 @@ struct padded_vnet_hdr {
> char padding[6];
> };
>
> +static const struct ethtool_ops virtnet_ethtool_ops;
> +
> +/*
> + * Converting between virtqueue no. and kernel tx/rx queue no.
> + * 0:rx0 1:tx0 2:cvq 3:rx1 4:tx1 ... 2N+1:rxN 2N+2:txN
> + */
Weird, this isn't what spec v5 says: it says
0:rx0 1:tx0 2:rx1 3:tx1 ... cvq
We can change the spec to match but keeping all rx/tx
together seems a bit prettier?
> +static int vq2txq(struct virtqueue *vq)
> +{
> + int index = virtqueue_get_queue_index(vq);
> + return index == 1 ? 0 : (index - 2) / 2;
> +}
> +
> +static int txq2vq(int txq)
> +{
> + return txq ? 2 * txq + 2 : 1;
> +}
> +
> +static int vq2rxq(struct virtqueue *vq)
> +{
> + int index = virtqueue_get_queue_index(vq);
> + return index ? (index - 1) / 2 : 0;
> +}
> +
> +static int rxq2vq(int rxq)
> +{
> + return rxq ? 2 * rxq + 1 : 0;
> +}
> +
> static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
> {
> return (struct skb_vnet_hdr *)skb->cb;
> @@ -166,7 +202,7 @@ static void skb_xmit_done(struct virtqueue *vq)
> virtqueue_disable_cb(vq);
>
> /* We were probably waiting for more output buffers. */
> - netif_wake_queue(vi->dev);
> + netif_wake_subqueue(vi->dev, vq2txq(vq));
> }
>
> static void set_skb_frag(struct sk_buff *skb, struct page *page,
> @@ -503,7 +539,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
> static void skb_recv_done(struct virtqueue *rvq)
> {
> struct virtnet_info *vi = rvq->vdev->priv;
> - struct receive_queue *rq = &vi->rq;
> + struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
>
> /* Schedule NAPI, Suppress further interrupts if successful. */
> if (napi_schedule_prep(&rq->napi)) {
> @@ -650,7 +686,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
> static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> - struct send_queue *sq = &vi->sq;
> + int qnum = skb_get_queue_mapping(skb);
> + struct send_queue *sq = &vi->sq[qnum];
> int capacity;
>
> /* Free up any pending old buffers before queueing new ones. */
> @@ -664,13 +701,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> if (likely(capacity == -ENOMEM)) {
> if (net_ratelimit())
> dev_warn(&dev->dev,
> - "TX queue failure: out of memory\n");
> + "TXQ (%d) failure: out of memory\n",
> + qnum);
> } else {
> dev->stats.tx_fifo_errors++;
> if (net_ratelimit())
> dev_warn(&dev->dev,
> - "Unexpected TX queue failure: %d\n",
> - capacity);
> + "Unexpected TXQ (%d) failure: %d\n",
> + qnum, capacity);
> }
> dev->stats.tx_dropped++;
> kfree_skb(skb);
> @@ -685,12 +723,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> /* Apparently nice girls don't return TX_BUSY; stop the queue
> * before it gets out of hand. Naturally, this wastes entries. */
> if (capacity < 2+MAX_SKB_FRAGS) {
> - netif_stop_queue(dev);
> + netif_stop_subqueue(dev, qnum);
> if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
> /* More just got used, free them then recheck. */
> capacity += free_old_xmit_skbs(sq);
> if (capacity >= 2+MAX_SKB_FRAGS) {
> - netif_start_queue(dev);
> + netif_start_subqueue(dev, qnum);
> virtqueue_disable_cb(sq->vq);
> }
> }
> @@ -758,23 +796,13 @@ static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
> static void virtnet_netpoll(struct net_device *dev)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> + int i;
>
> - napi_schedule(&vi->rq.napi);
> + for (i = 0; i < vi->curr_queue_pairs; i++)
> + napi_schedule(&vi->rq[i].napi);
> }
> #endif
>
> -static int virtnet_open(struct net_device *dev)
> -{
> - struct virtnet_info *vi = netdev_priv(dev);
> -
> - /* Make sure we have some buffers: if oom use wq. */
> - if (!try_fill_recv(&vi->rq, GFP_KERNEL))
> - schedule_delayed_work(&vi->rq.refill, 0);
> -
> - virtnet_napi_enable(&vi->rq);
> - return 0;
> -}
> -
> /*
> * Send command via the control virtqueue and check status. Commands
> * supported by the hypervisor, as indicated by feature bits, should
> @@ -830,13 +858,53 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
> rtnl_unlock();
> }
>
> +static int virtnet_set_queues(struct virtnet_info *vi)
> +{
> + struct scatterlist sg;
> + struct virtio_net_ctrl_rfs s;
> + struct net_device *dev = vi->dev;
> +
> + s.virtqueue_pairs = vi->curr_queue_pairs;
> + sg_init_one(&sg, &s, sizeof(s));
> +
> + if (!vi->has_cvq)
> + return -EINVAL;
> +
> + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
> + VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, &sg, 1, 0)){
> + dev_warn(&dev->dev, "Fail to set the number of queue pairs to"
> + " %d\n", vi->curr_queue_pairs);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static int virtnet_open(struct net_device *dev)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> + int i;
> +
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + /* Make sure we have some buffers: if oom use wq. */
> + if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> + schedule_delayed_work(&vi->rq[i].refill, 0);
> + virtnet_napi_enable(&vi->rq[i]);
> + }
> +
> + return 0;
> +}
> +
> static int virtnet_close(struct net_device *dev)
> {
> struct virtnet_info *vi = netdev_priv(dev);
> + int i;
>
> /* Make sure refill_work doesn't re-enable napi! */
> - cancel_delayed_work_sync(&vi->rq.refill);
> - napi_disable(&vi->rq.napi);
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + cancel_delayed_work_sync(&vi->rq[i].refill);
> + napi_disable(&vi->rq[i].napi);
> + }
>
> return 0;
> }
> @@ -948,8 +1016,8 @@ static void virtnet_get_ringparam(struct net_device *dev,
> {
> struct virtnet_info *vi = netdev_priv(dev);
>
> - ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
> - ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
> + ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
> + ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
> ring->rx_pending = ring->rx_max_pending;
> ring->tx_pending = ring->tx_max_pending;
> }
> @@ -967,12 +1035,6 @@ static void virtnet_get_drvinfo(struct net_device *dev,
>
> }
>
> -static const struct ethtool_ops virtnet_ethtool_ops = {
> - .get_drvinfo = virtnet_get_drvinfo,
> - .get_link = ethtool_op_get_link,
> - .get_ringparam = virtnet_get_ringparam,
> -};
> -
> #define MIN_MTU 68
> #define MAX_MTU 65535
>
> @@ -984,6 +1046,20 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
> return 0;
> }
>
> +/* To avoid contending a lock hold by a vcpu who would exit to host, select the
> + * txq based on the processor id.
> + */
> +static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
> +{
> + int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
> + smp_processor_id();
> +
> + while (unlikely(txq >= dev->real_num_tx_queues))
> + txq -= dev->real_num_tx_queues;
> +
> + return txq;
> +}
> +
> static const struct net_device_ops virtnet_netdev = {
> .ndo_open = virtnet_open,
> .ndo_stop = virtnet_close,
> @@ -995,6 +1071,7 @@ static const struct net_device_ops virtnet_netdev = {
> .ndo_get_stats64 = virtnet_stats,
> .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
> .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
> + .ndo_select_queue = virtnet_select_queue,
> #ifdef CONFIG_NET_POLL_CONTROLLER
> .ndo_poll_controller = virtnet_netpoll,
> #endif
> @@ -1030,10 +1107,10 @@ static void virtnet_config_changed_work(struct work_struct *work)
>
> if (vi->status & VIRTIO_NET_S_LINK_UP) {
> netif_carrier_on(vi->dev);
> - netif_wake_queue(vi->dev);
> + netif_tx_wake_all_queues(vi->dev);
> } else {
> netif_carrier_off(vi->dev);
> - netif_stop_queue(vi->dev);
> + netif_tx_stop_all_queues(vi->dev);
> }
> done:
> mutex_unlock(&vi->config_lock);
> @@ -1046,41 +1123,212 @@ static void virtnet_config_changed(struct virtio_device *vdev)
> schedule_work(&vi->config_work);
> }
>
> -static int init_vqs(struct virtnet_info *vi)
> +static void free_receive_bufs(struct virtnet_info *vi)
> +{
> + int i;
> +
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + while (vi->rq[i].pages)
> + __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
> + }
> +}
> +
> +/* Free memory allocated for send and receive queues */
> +static void virtnet_free_queues(struct virtnet_info *vi)
> {
> - struct virtqueue *vqs[3];
> - vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
> - const char *names[] = { "input", "output", "control" };
> - int nvqs, err;
> + kfree(vi->rq);
> + vi->rq = NULL;
> + kfree(vi->sq);
> + vi->sq = NULL;
> +}
>
> - /* We expect two virtqueues, receive then send,
> - * and optionally control. */
> - nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
> +static void free_unused_bufs(struct virtnet_info *vi)
> +{
> + void *buf;
> + int i;
>
> - err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks, names);
> - if (err)
> - return err;
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + struct virtqueue *vq = vi->sq[i].vq;
> + while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> + dev_kfree_skb(buf);
> + }
>
> - vi->rq.vq = vqs[0];
> - vi->sq.vq = vqs[1];
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + struct virtqueue *vq = vi->rq[i].vq;
>
> - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
> - vi->cvq = vqs[2];
> + while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
> + if (vi->mergeable_rx_bufs || vi->big_packets)
> + give_pages(&vi->rq[i], buf);
> + else
> + dev_kfree_skb(buf);
> + --vi->rq[i].num;
> + }
> + BUG_ON(vi->rq[i].num != 0);
> + }
> +}
>
> +static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
> +{
> + int i;
> +
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + int cpu = set ? i : -1;
> + virtqueue_set_affinity(vi->rq[i].vq, cpu);
> + virtqueue_set_affinity(vi->sq[i].vq, cpu);
> + }
> +}
> +
> +static void virtnet_del_vqs(struct virtnet_info *vi)
> +{
> + struct virtio_device *vdev = vi->vdev;
> +
> + virtnet_set_affinity(vi, false);
> +
> + vdev->config->del_vqs(vdev);
> +
> + virtnet_free_queues(vi);
> +}
> +
> +static int virtnet_find_vqs(struct virtnet_info *vi)
> +{
> + vq_callback_t **callbacks;
> + struct virtqueue **vqs;
> + int ret = -ENOMEM;
> + int i, total_vqs;
> + char **names;
> +
> + /*
> + * We expect 1 RX virtqueue followed by 1 TX virtqueue, followd by
> + * possible control virtqueue, followed by RX/TX N-1 queue pairs used
> + * in multiqueue mode.
> + */
> + total_vqs = vi->max_queue_pairs * 2 +
> + virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
> +
> + /* Allocate space for find_vqs parameters */
> + vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
> + callbacks = kzalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
> + if (!vqs || !callbacks)
> + goto err_mem;
> + names = kzalloc(total_vqs * sizeof(*names), GFP_KERNEL);
> + if (!names)
> + goto err_mem;
> +
> + /* Parameters for control virtqueue, if any */
> + if (vi->has_cvq) {
> + callbacks[2] = NULL;
> + names[2] = kasprintf(GFP_KERNEL, "control");
> + }
> +
> + /* Allocate/initialize parameters for send/receive virtqueues */
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + callbacks[rxq2vq(i)] = skb_recv_done;
> + callbacks[txq2vq(i)] = skb_xmit_done;
> + names[rxq2vq(i)] = kasprintf(GFP_KERNEL, "input.%d", i);
> + names[txq2vq(i)] = kasprintf(GFP_KERNEL, "output.%d", i);
> + }
> +
> + ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
> + (const char **)names);
> + if (ret)
> + goto err_names;
> +
> + if (vi->has_cvq) {
> + vi->cvq = vqs[2];
> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
> vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
> }
> +
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + vi->rq[i].vq = vqs[rxq2vq(i)];
> + vi->sq[i].vq = vqs[txq2vq(i)];
> + }
> +
> + kfree(callbacks);
> + kfree(vqs);
> +
> + return 0;
> +
> +err_names:
> + for (i = 0; i < total_vqs * 2; i ++)
> + kfree(names[i]);
> + kfree(names);
> +
> +err_mem:
> + kfree(callbacks);
> + kfree(vqs);
> +
> + return ret;
> +}
> +
> +static int virtnet_alloc_queues(struct virtnet_info *vi)
> +{
> + int i;
> +
> + vi->sq = kzalloc(sizeof(vi->sq[0]) * vi->max_queue_pairs, GFP_KERNEL);
> + vi->rq = kzalloc(sizeof(vi->rq[0]) * vi->max_queue_pairs, GFP_KERNEL);
> + if (!vi->rq || !vi->sq)
> + goto err;
> +
> + /* setup initial receive and send queue parameters */
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + vi->rq[i].pages = NULL;
> + INIT_DELAYED_WORK(&vi->rq[i].refill, refill_work);
> + netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
> + napi_weight);
> +
> + sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> + sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
> + }
> +
> +
> return 0;
> +
> +err:
> + virtnet_free_queues(vi);
> + return -ENOMEM;
> +}
> +
> +static int init_vqs(struct virtnet_info *vi)
> +{
> + int ret;
> +
> + /* Allocate send & receive queues */
> + ret = virtnet_alloc_queues(vi);
> + if (ret)
> + goto err;
> +
> + ret = virtnet_find_vqs(vi);
> + if (ret)
> + goto err_free;
> +
> + virtnet_set_affinity(vi, true);
> + return 0;
> +
> +err_free:
> + virtnet_free_queues(vi);
> +err:
> + return ret;
> }
>
> static int virtnet_probe(struct virtio_device *vdev)
> {
> - int err;
> + int i, err;
> struct net_device *dev;
> struct virtnet_info *vi;
> + u16 curr_queue_pairs;
Probably a good idea to rename this max_queue_pairs.
> +
> + /* Find if host supports multiqueue virtio_net device */
> + err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
> + offsetof(struct virtio_net_config,
> + max_virtqueue_pairs), &curr_queue_pairs);
> +
> + /* We need at least 2 queue's */
> + if (err)
> + curr_queue_pairs = 1;
Let's also validate against VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN
and VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX.
>
> /* Allocate ourselves a network device with room for our info */
> - dev = alloc_etherdev(sizeof(struct virtnet_info));
> + dev = alloc_etherdev_mq(sizeof(struct virtnet_info), curr_queue_pairs);
> if (!dev)
> return -ENOMEM;
>
> @@ -1126,22 +1374,17 @@ static int virtnet_probe(struct virtio_device *vdev)
>
> /* Set up our device-specific information */
> vi = netdev_priv(dev);
> - netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
> vi->dev = dev;
> vi->vdev = vdev;
> vdev->priv = vi;
> - vi->rq.pages = NULL;
> vi->stats = alloc_percpu(struct virtnet_stats);
> err = -ENOMEM;
> if (vi->stats == NULL)
> goto free;
>
> - INIT_DELAYED_WORK(&vi->rq.refill, refill_work);
> mutex_init(&vi->config_lock);
> vi->config_enable = true;
> INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> - sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
> - sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
>
> /* If we can receive ANY GSO packets, we must allocate large ones. */
> if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -1152,10 +1395,21 @@ static int virtnet_probe(struct virtio_device *vdev)
> if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
> vi->mergeable_rx_bufs = true;
>
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
> + vi->has_cvq = true;
> +
> + /* Use single tx/rx queue pair as default */
> + vi->curr_queue_pairs = 1;
> + vi->max_queue_pairs = curr_queue_pairs;
> +
> + /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
> err = init_vqs(vi);
> if (err)
> goto free_stats;
>
> + netif_set_real_num_tx_queues(dev, 1);
> + netif_set_real_num_rx_queues(dev, 1);
> +
> err = register_netdev(dev);
> if (err) {
> pr_debug("virtio_net: registering device failed\n");
> @@ -1163,12 +1417,15 @@ static int virtnet_probe(struct virtio_device *vdev)
> }
>
> /* Last of all, set up some receive buffers. */
> - try_fill_recv(&vi->rq, GFP_KERNEL);
> -
> - /* If we didn't even get one input buffer, we're useless. */
> - if (vi->rq.num == 0) {
> - err = -ENOMEM;
> - goto unregister;
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + try_fill_recv(&vi->rq[i], GFP_KERNEL);
> +
> + /* If we didn't even get one input buffer, we're useless. */
> + if (vi->rq[i].num == 0) {
> + free_unused_bufs(vi);
> + err = -ENOMEM;
> + goto free_recv_bufs;
> + }
> }
>
> /* Assume link up if device can't report link status,
> @@ -1181,13 +1438,20 @@ static int virtnet_probe(struct virtio_device *vdev)
> netif_carrier_on(dev);
> }
>
> - pr_debug("virtnet: registered device %s\n", dev->name);
> + pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
> + dev->name, curr_queue_pairs);
> +
> return 0;
>
> -unregister:
> +free_recv_bufs:
> + free_receive_bufs(vi);
> unregister_netdev(dev);
> +
> free_vqs:
> - vdev->config->del_vqs(vdev);
> + for (i = 0; i <curr_queue_pairs; i++)
> + cancel_delayed_work_sync(&vi->rq[i].refill);
> + virtnet_del_vqs(vi);
> +
> free_stats:
> free_percpu(vi->stats);
> free:
> @@ -1195,28 +1459,6 @@ free:
> return err;
> }
>
> -static void free_unused_bufs(struct virtnet_info *vi)
> -{
> - void *buf;
> - while (1) {
> - buf = virtqueue_detach_unused_buf(vi->sq.vq);
> - if (!buf)
> - break;
> - dev_kfree_skb(buf);
> - }
> - while (1) {
> - buf = virtqueue_detach_unused_buf(vi->rq.vq);
> - if (!buf)
> - break;
> - if (vi->mergeable_rx_bufs || vi->big_packets)
> - give_pages(&vi->rq, buf);
> - else
> - dev_kfree_skb(buf);
> - --vi->rq.num;
> - }
> - BUG_ON(vi->rq.num != 0);
> -}
> -
> static void remove_vq_common(struct virtnet_info *vi)
> {
> vi->vdev->config->reset(vi->vdev);
> @@ -1224,10 +1466,9 @@ static void remove_vq_common(struct virtnet_info *vi)
> /* Free unused buffers in both send and recv, if any. */
> free_unused_bufs(vi);
>
> - vi->vdev->config->del_vqs(vi->vdev);
> + free_receive_bufs(vi);
>
> - while (vi->rq.pages)
> - __free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
> + virtnet_del_vqs(vi);
> }
>
> static void __devexit virtnet_remove(struct virtio_device *vdev)
> @@ -1253,6 +1494,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
> static int virtnet_freeze(struct virtio_device *vdev)
> {
> struct virtnet_info *vi = vdev->priv;
> + int i;
>
> /* Prevent config work handler from accessing the device */
> mutex_lock(&vi->config_lock);
> @@ -1260,10 +1502,14 @@ static int virtnet_freeze(struct virtio_device *vdev)
> mutex_unlock(&vi->config_lock);
>
> netif_device_detach(vi->dev);
> - cancel_delayed_work_sync(&vi->rq.refill);
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + cancel_delayed_work_sync(&vi->rq[i].refill);
>
> if (netif_running(vi->dev))
> - napi_disable(&vi->rq.napi);
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + napi_disable(&vi->rq[i].napi);
> + netif_napi_del(&vi->rq[i].napi);
> + }
>
> remove_vq_common(vi);
>
> @@ -1275,24 +1521,28 @@ static int virtnet_freeze(struct virtio_device *vdev)
> static int virtnet_restore(struct virtio_device *vdev)
> {
> struct virtnet_info *vi = vdev->priv;
> - int err;
> + int err, i;
>
> err = init_vqs(vi);
> if (err)
> return err;
>
> if (netif_running(vi->dev))
> - virtnet_napi_enable(&vi->rq);
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + virtnet_napi_enable(&vi->rq[i]);
>
> netif_device_attach(vi->dev);
>
> - if (!try_fill_recv(&vi->rq, GFP_KERNEL))
> - schedule_delayed_work(&vi->rq.refill, 0);
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> + schedule_delayed_work(&vi->rq[i].refill, 0);
>
> mutex_lock(&vi->config_lock);
> vi->config_enable = true;
> mutex_unlock(&vi->config_lock);
>
> + BUG_ON(virtnet_set_queues(vi));
> +
Won't this always fail when control vq is off?
> return 0;
> }
> #endif
> @@ -1310,7 +1560,7 @@ static unsigned int features[] = {
> VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
> VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
> VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
> - VIRTIO_NET_F_GUEST_ANNOUNCE,
> + VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_RFS,
> };
>
> static struct virtio_driver virtio_net_driver = {
> @@ -1328,6 +1578,12 @@ static struct virtio_driver virtio_net_driver = {
> #endif
> };
>
> +static const struct ethtool_ops virtnet_ethtool_ops = {
> + .get_drvinfo = virtnet_get_drvinfo,
> + .get_link = ethtool_op_get_link,
> + .get_ringparam = virtnet_get_ringparam,
> +};
> +
> static int __init init(void)
> {
> return register_virtio_driver(&virtio_net_driver);
> diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
> index 2470f54..6056cec 100644
> --- a/include/uapi/linux/virtio_net.h
> +++ b/include/uapi/linux/virtio_net.h
> @@ -51,6 +51,7 @@
> #define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */
> #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 /* Guest can announce device on the
> * network */
> +#define VIRTIO_NET_F_RFS 22 /* Device supports multiple TXQ/RXQ */
>
> #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */
> #define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */
> @@ -60,6 +61,8 @@ struct virtio_net_config {
> __u8 mac[6];
> /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
> __u16 status;
> + /* Total number of RX/TX queues */
> + __u16 max_virtqueue_pairs;
> } __attribute__((packed));
>
> /* This is the first element of the scatter-gather list. If you don't
> @@ -166,4 +169,17 @@ struct virtio_net_ctrl_mac {
> #define VIRTIO_NET_CTRL_ANNOUNCE 3
> #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0
>
> +/*
> + * Control multiqueue
> + *
> + */
> +struct virtio_net_ctrl_rfs {
> + u16 virtqueue_pairs;
> +};
> +
> +#define VIRTIO_NET_CTRL_RFS 4
> + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET 0
> + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN 1
> + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX 0x8000
> +
> #endif /* _LINUX_VIRTIO_NET_H */
> --
> 1.7.1
^ permalink raw reply
* Re: [net-next rfc v7 3/3] virtio-net: change the number of queues through ethtool
From: Michael S. Tsirkin @ 2012-12-02 16:09 UTC (permalink / raw)
To: Jason Wang
Cc: krkumar2, kvm, netdev, linux-kernel, virtualization, bhutchings,
jwhan, shiyer
In-Reply-To: <1354011360-39479-4-git-send-email-jasowang@redhat.com>
On Tue, Nov 27, 2012 at 06:16:00PM +0800, Jason Wang wrote:
> This patch implements the {set|get}_channels methods of ethtool to allow the user to
> change the number of queues dynamically while the device is running. This
> lets the user configure it on demand.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/virtio_net.c | 41 +++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 41 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index bcaa6e5..f08ec2a 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -1578,10 +1578,51 @@ static struct virtio_driver virtio_net_driver = {
> #endif
> };
>
> +/* TODO: Eliminate OOO packets during switching */
> +static int virtnet_set_channels(struct net_device *dev,
> + struct ethtool_channels *channels)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> + u16 queue_pairs = channels->combined_count;
> +
> + /* We don't support separate rx/tx channels.
> + * We don't allow setting 'other' channels.
> + */
> + if (channels->rx_count || channels->tx_count || channels->other_count)
> + return -EINVAL;
> +
> + /* Only two modes were support currently */
> + if (queue_pairs != vi->max_queue_pairs && queue_pairs != 1)
> + return -EINVAL;
> +
Why the limitation?
Also how does userspace discover what the legal values are?
> + vi->curr_queue_pairs = queue_pairs;
> + BUG_ON(virtnet_set_queues(vi));
> +
> + netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
> + netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
> +
> + return 0;
> +}
> +
> +static void virtnet_get_channels(struct net_device *dev,
> + struct ethtool_channels *channels)
> +{
> + struct virtnet_info *vi = netdev_priv(dev);
> +
> + channels->combined_count = vi->curr_queue_pairs;
> + channels->max_combined = vi->max_queue_pairs;
> + channels->max_other = 0;
> + channels->rx_count = 0;
> + channels->tx_count = 0;
> + channels->other_count = 0;
> +}
> +
> static const struct ethtool_ops virtnet_ethtool_ops = {
> .get_drvinfo = virtnet_get_drvinfo,
> .get_link = ethtool_op_get_link,
> .get_ringparam = virtnet_get_ringparam,
> + .set_channels = virtnet_set_channels,
> + .get_channels = virtnet_get_channels,
> };
>
> static int __init init(void)
> --
> 1.7.1
^ permalink raw reply
* Re: [PATCH v3]realtek:r8169: Bugfix or workaround for missing extended GigaMAC registers settings
From: Wang YanQing @ 2012-12-02 16:34 UTC (permalink / raw)
To: Francois Romieu; +Cc: nic_swsd, netdev, linux-kernel
In-Reply-To: <20121201114401.GA3989@electric-eye.fr.zoreil.com>
On Sat, Dec 01, 2012 at 12:44:01PM +0100, Francois Romieu wrote:
> Wang YanQing <udknight@gmail.com> :
> > + /*
> > + *This is a fix for BIOS forget to set
> > + *extend GigaMAC registers
> > + *Wang YanQing 12/1/2012
> > + */
>
> This part will go into the changelog.
I think a brief comment in the code is good for
its readability. We read out MAC{0,4} and write them
back on the next line by calling rtl_rar_set; the reason
for this is not obvious to new readers, so I think
a brief comment is useful. Could you consider keeping
the comment, except for the meaningless line "Wang YanQing 12/1/2012"?
>
> > + if (tp->mac_version == RTL_GIGA_MAC_VER_34) {
> > + rtl_rar_set(tp, dev->dev_addr);
> > + }
>
> rtl_rar_set already includes a RTL_GIGA_MAC_VER_34 test and non-8168evl
> devices are already able to stand an extra MAC{0, 4} write. I'll check
> it does not hurt on different 81xx devices and submit an update.
I added the test to avoid the extra MAC{0,4} write on non-8168evl
devices; if you think that is not an issue, then I agree with you about removing
the test.
Thanks.
^ permalink raw reply
* Re: [PATCH RFC] [INET]: Get cirtical word in first 64bit of cache line
From: Eric Dumazet @ 2012-12-02 17:20 UTC (permalink / raw)
To: Ling Ma; +Cc: linux-kernel, netdev
In-Reply-To: <CAOGi=dO2wZESX5o4Jr_XZu0oPM-Qe30DKt-4f_3TVBrVoR=12Q@mail.gmail.com>
On Sun, 2012-12-02 at 21:25 +0800, Ling Ma wrote:
> Hi Eric,
>
> Attached benchmark test-cwf.c(cc -o test-cwf test-cwf.c), the result
> shows when last level cache(LLC) miss and CPU fetches data from
> memory, critical word as first 64bit member in cache line has better
> performance(costs 158290336 cycles ) than other positions(offset 0x10,
> costs 164100732 ) in cache line, the performance is improved by 3.6%
> in this case.
> cpu-info is also involved too.
>
> Thanks
> Ling
Thanks Ling.
Note that I was more interested in the case where we read more fields per
cache line, like we do in tcp lookups (skc_daddr, skc_rcv_saddr,
skc_bound_dev_if, skc_net).
I made changes to net-next to prepare your patch.
You'll have to move both skc_rxhash & skc_portpair before the
skc_addrpair.
I have to fix an endianness sparse problem, I'll send a patch for this
in a separate thread right now.
^ permalink raw reply
* [PATCH net-next] net: fix sparse endianness warnings on sock_common
From: Eric Dumazet @ 2012-12-02 17:33 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Fengguang Wu, Ling Ma
From: Eric Dumazet <edumazet@google.com>
# make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/inet_hashtables.o
...
net/ipv4/inet_hashtables.c:242:7: warning: restricted __portpair degrades to integer
net/ipv4/inet_hashtables.c:242:7: warning: restricted __addrpair degrades to integer
...
Move __portpair/__addrpair from include/net/inet_hashtables.h
to include/net/sock.h where we need them in struct sock_common
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ling Ma <ling.ma.program@gmail.com>
---
include/net/inet_hashtables.h | 2 --
include/net/sock.h | 7 +++++--
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index d1de4fb..67a8fa0 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -277,7 +277,6 @@ static inline struct sock *inet_lookup_listener(struct net *net,
On 64bit targets we combine comparisons with pair of adjacent __be32
fields in the same way.
*/
-typedef __u32 __bitwise __portpair;
#ifdef __BIG_ENDIAN
#define INET_COMBINED_PORTS(__sport, __dport) \
((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
@@ -287,7 +286,6 @@ typedef __u32 __bitwise __portpair;
#endif
#if (BITS_PER_LONG == 64)
-typedef __u64 __bitwise __addrpair;
#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
const __addrpair __name = (__force __addrpair) ( \
diff --git a/include/net/sock.h b/include/net/sock.h
index c4132c1..0a9a01a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -126,6 +126,9 @@ struct sock;
struct proto;
struct net;
+typedef __u32 __bitwise __portpair;
+typedef __u64 __bitwise __addrpair;
+
/**
* struct sock_common - minimal network layer representation of sockets
* @skc_daddr: Foreign IPv4 addr
@@ -155,7 +158,7 @@ struct sock_common {
* address on 64bit arches : cf INET_MATCH() and INET_TW_MATCH()
*/
union {
- unsigned long skc_addrpair;
+ __addrpair skc_addrpair;
struct {
__be32 skc_daddr;
__be32 skc_rcv_saddr;
@@ -167,7 +170,7 @@ struct sock_common {
};
/* skc_dport && skc_num must be grouped as well */
union {
- u32 skc_portpair;
+ __portpair skc_portpair;
struct {
__be16 skc_dport;
__u16 skc_num;
^ permalink raw reply related
* Checking struct size against sizeof(skb->cb) (was Re: [PATCH 00/17] ATM fixes for pppoatm/br2684)
From: David Woodhouse @ 2012-12-02 21:29 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <1354436040.21562.386.camel@shinybook.infradead.org>
[-- Attachment #1: Type: text/plain, Size: 4031 bytes --]
On Sun, 2012-12-02 at 08:14 +0000, David Woodhouse wrote:
> On Sat, 2012-12-01 at 20:49 -0500, David Miller wrote:
> >
> > I actually prefer what we do now, which is do the BUILD_BUG_ON()
> > once in the subsystem specific code, usually the initializer.
> >
> > It's part of creating a new SKB cb, adding that assertion somewhere.
>
> Where it's *subsystem* code that's great...
Hmm... or maybe not. A quick check suggests that about two-thirds of
users even in net/ aren't actually doing the check. My brief
investigation gives a score of 30 to 14 (or thereabouts; there may be
some that *do* check, but I failed to spot it. And I used a Fedora
config, not allyesconfig).
If you don't want an automatic check/cast macro (which is only adding a
compile-time check; no runtime overhead), then is it worth doing a
bombing run on the offenders listed below and adding the manual checks?
And I'll look at *drivers* next... which I suspect will be worse.
I concede there are probably no actual *bugs* being hidden here — I
don't think any of them actually *do* overflow. But since the check is
free at run-time, we *ought* to be doing it. Even if a given struct is
tiny and there's no *chance* of it overflowing, people might still add
to it. After all, the solos_skb_cb struct was tiny too until I stupidly
added a completion to it.
(Actually, I'm not entirely sure about 'no bugs'. The L2TP thing with
starting its own struct at &skb->cb[sizeof(struct inet_skb_parm)]
doesn't make it overflow, but what the hell is l2tp_xmit_skb() doing
poking at IPCB(skb) anyway... is it even guaranteed to be Legacy IP? Can
it not be IPv6? And why would anything else *trust* the contents of ->cb
on a skb that just got handed to it?)
My list:
Size not checked against sizeof(skb->cb):
struct napi_gro_cb (include/linux/netdevice.h)
struct ip6_mtuinfo (include/linux/ipv6.h)
struct sctp_ulpevent (include/net/sctp/ulpevent.h)
struct bt_skb_cb (include/net/bluetooth/bluetooth.h)
struct atm_skb_data (include/linux/atmdev.h)
struct br_input_skb_cb (net/bridge/br_private.h)
struct hci_cb (include/net/bluetooth/hci_core.h)
struct sock_exterr_skb (include/linux/errqueue.h)
struct udp_skb_cb (include/net/udp.h)
struct ipx_cb (include/net/ipx.h)
struct neighbour_cb (include/net/neighbour.h)
struct ip6frag_skb_cb (net/ipv6/reassembly.c)
struct dev_gso_cb (net/core/dev.c)
struct irda_skb_cb (include/net/irda/irda_device.h)
struct cmtp_skb (net/bluetooth/cmtp/cmtp.h)
struct ipfrag_skb_cb (net/ipv4/ip_fragment.c)
struct xfrm_skb_cb (include/net/xfrm.h)
struct xfrm_mode_skb_cb (include/net/xfrm.h)
struct xfrm_spi_skb_cb (include/net/xfrm.h)
struct in_pktinfo (include/uapi/linux/in.h)
struct nf_ct_frag6_skb_cb (net/ipv6/netfilter/nf_conntrack_reasm.c)
struct l2tp_skb_cb (net/l2tp/l2tp_core.c) (less than full cb)
struct ah_skb_cb (net/ipv4/ah4.c)
struct ah_skb_cb (net/ipv6/ah6.c)
struct esp_skb_cb (net/ipv4/esp4.c)
struct esp_skb_cb (net/ipv6/esp6.c)
struct skb_eosp_msg_data (net/mac80211/ieee80211_i.h)
struct mISDNhead (include/linux/mISDNif.h)
struct ieee80211_ra_tid (net/mac80211/iface.c)
struct sctp_input_cb (net/sctp/input.c)
Checked:
struct unix_skb_parms (include/net/af_unix.h)
struct packet_skb_cb (net/packet/af_packet.c)
struct ovs_skb_cb (net/openvswitch/datapath.h)
struct ieee80211_tx_info (include/net/mac80211.h)
struct ieee80211_rx_status (include/net/mac80211.h)
struct tcp_skb_cb (include/net/tcp.h)
struct ieee802154_mac_cb (include/net/ieee802154_netdev.h)
struct netlink_cb_parms (include/linux/netlink.h)
struct inet6_skb_parm (include/linux/ipv6.h)
struct inet_skb_parm (include/net/ip.h)
struct tcp_skb_cb (include/net/tcp.h)
struct garp_skb_cb (include/net/garp.h)
struct dccp_skb_cb (net/dccp/dccp.h)
struct qdisc_skb_cb (include/net/sch_generic.h)
--
David Woodhouse Open Source Technology Centre
David.Woodhouse@intel.com Intel Corporation
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]
^ permalink raw reply
* Re: Optics (SFP) monitoring on ixgbe and igbe
From: Aurélien @ 2012-12-02 21:47 UTC (permalink / raw)
To: Ben Hutchings; +Cc: netdev
In-Reply-To: <1354335498.2640.23.camel@bwh-desktop.uk.solarflarecom.com>
[-- Attachment #1: Type: text/plain, Size: 2554 bytes --]
Hi Ben,
Thanks for your review. Here's a fixed-up version, according to your remarks.
On Sat, Dec 1, 2012 at 5:18 AM, Ben Hutchings <bhutchings@solarflare.com> wrote:
>
> This version drops the -lm completely, so it doesn't link. Maybe you
> edited the generated Makefile or Makefile.in?
No, I just stupidly forgot to make distclean & autogen after removing
all libm checks. Re-added AC_CHECK_LIB to link with it.
>
> The option alias should be included in the manual page and in a
> (trivial) test case in test-cmdline.c.
>
Included in the man page, along with a modified option description,
and a test equivalent to --dump-module-eeprom, which passes.
>
> The indentation is still weird, though:
>
> [...]
>
> These comments should be lined up vertically.
Yup, I had a mix-up between tabs and spaces, and my Vim config did not
help. Fixed.
>
> be32toh() is non-standard and was apparently added to glibc relatively
> recently (version 2.9). Therefore please use the equivalent ntohl()
> instead.
Did that, it indeed works just fine. I should have used that from the start.
>
> Function-like macros generally shouldn't be defined with a trailing
> semi-colon, as that will be added at the point of use.
>
That was a copy/paste typo, fixed.
> The backslashes should be lined up on the right, and continuation lines
> within parentheses should be indented so they begin just to the right of
> the opening parenthesis, e.g.:
>
> #define PRINT_VCC(string, index) \
> printf("\t%-41s : %.4f V\n", (string), \
> (double)(sd.sfp_voltage[(index)] / 10000.))
>
> [...]
>> + PRINT_xX_PWR("Laser output power low warning threshold",
>> + sd.tx_power, LWARN);
>
> The continuation lines are over-indented here.
>
Fixed (I was using the wrong tabstop width of 4).
>> - printf("\tActive Cu cmplnce. : 0x%02x", id[60]);
>> + printf("\t%-41s : 0x%02x", "Active copper compliance", id[60]);
>
> If you want to change these labels, do that in a separate patch.
>
There's no real need, so I'll leave those alone, just changing the
alignment like for the other labels.
> [...]
>> - "Length (62.5um)", 10, "m");
>> + "Length (62.5um)", 10, "m");
>
> These changes are unnecessary.
>
Agreed, removed from the patch.
Best regards,
--
Aurélien Guillaume
[-- Attachment #2: 0001-Implemented-basic-optics-diagnostics-for-SFF-8472.patch --]
[-- Type: application/octet-stream, Size: 22638 bytes --]
From 9a6e14770f12aa728751e3c9256968b4fd611290 Mon Sep 17 00:00:00 2001
From: Aurelien Guillaume <aurelien@iwi.me>
Date: Sun, 2 Dec 2012 21:21:01 +0100
Subject: [PATCH] Implemented basic optics diagnostics for SFF-8472
Signed-off-by: Aurelien Guillaume <aurelien@iwi.me>
---
Makefile.am | 2 +-
configure.ac | 1 +
ethtool.8.in | 8 +-
ethtool.c | 17 +++-
internal.h | 3 +
sfpdiag.c | 362 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
sfpid.c | 28 +++--
test-cmdline.c | 2 +
8 files changed, 404 insertions(+), 19 deletions(-)
create mode 100644 sfpdiag.c
diff --git a/Makefile.am b/Makefile.am
index e33f71f..89a0d1e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -9,7 +9,7 @@ ethtool_SOURCES = ethtool.c ethtool-copy.h internal.h net_tstamp-copy.h \
fec_8xx.c ibm_emac.c ixgb.c ixgbe.c natsemi.c \
pcnet32.c realtek.c tg3.c marvell.c vioc.c \
smsc911x.c at76c50x-usb.c sfc.c stmmac.c \
- rxclass.c sfpid.c
+ rxclass.c sfpid.c sfpdiag.c
TESTS = test-cmdline test-features
check_PROGRAMS = test-cmdline test-features
diff --git a/configure.ac b/configure.ac
index 0c597c6..5806eae 100644
--- a/configure.ac
+++ b/configure.ac
@@ -13,6 +13,7 @@ AC_PROG_GCC_TRADITIONAL
AM_PROG_CC_C_O
dnl Checks for libraries.
+AC_CHECK_LIB([m], [log10])
dnl Checks for header files.
AC_CHECK_HEADERS(sys/ioctl.h)
diff --git a/ethtool.8.in b/ethtool.8.in
index a3c7fbb..e701919 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -312,7 +312,7 @@ ethtool \- query or control network driver and hardware settings
.BN other
.BN combined
.HP
-.B ethtool \-m|\-\-dump\-module\-eeprom
+.B ethtool \-m|\-\-dump\-module\-eeprom|\-\-module\-info
.I devname
.B2 raw on off
.B2 hex on off
@@ -815,8 +815,10 @@ Changes the number of channels used only for other purposes e.g. link interrupts
.BI combined \ N
Changes the number of multi-purpose channels.
.TP
-.B \-m \-\-dump\-module\-eeprom
-Retrieves and if possible decodes the EEPROM from plugin modules, e.g SFP+, QSFP
+.B \-m \-\-dump\-module\-eeprom \-\-module\-info
+Retrieves and if possible decodes the EEPROM from plugin modules, e.g SFP+, QSFP.
+If the driver and module support it, the optical diagnostic information is also
+read and decoded.
.TP
.B \-\-show\-priv\-flags
Queries the specified network device for its private flags. The
diff --git a/ethtool.c b/ethtool.c
index 3db7fec..345c21c 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -3604,6 +3604,16 @@ static int do_getmodule(struct cmd_context *ctx)
return 1;
}
+ /*
+ * SFF-8079 EEPROM layout contains the memory available at A0 address on
+ * the PHY EEPROM.
+ * SFF-8472 defines a virtual extension of the EEPROM, where the
+ * microcontroller on the SFP/SFP+ generates a page at the A2 address,
+ * which contains data relative to optical diagnostics.
+ * The current kernel implementation returns a blob, which contains:
+ * - ETH_MODULE_SFF_8079 => The A0 page only.
+ * - ETH_MODULE_SFF_8472 => The A0 and A2 page concatenated.
+ */
if (geeprom_dump_raw) {
fwrite(eeprom->data, 1, eeprom->len, stdout);
} else {
@@ -3613,8 +3623,11 @@ static int do_getmodule(struct cmd_context *ctx)
} else if (!geeprom_dump_hex) {
switch (modinfo.type) {
case ETH_MODULE_SFF_8079:
+ sff8079_show_all(eeprom->data);
+ break;
case ETH_MODULE_SFF_8472:
sff8079_show_all(eeprom->data);
+ sff8472_show_all(eeprom->data);
break;
default:
geeprom_dump_hex = 1;
@@ -3831,8 +3844,8 @@ static const struct option {
{ "--show-priv-flags" , 1, do_gprivflags, "Query private flags" },
{ "--set-priv-flags", 1, do_sprivflags, "Set private flags",
" FLAG on|off ...\n" },
- { "-m|--dump-module-eeprom", 1, do_getmodule,
- "Qeuery/Decode Module EEPROM information",
+ { "-m|--dump-module-eeprom|--module-info", 1, do_getmodule,
+ "Query/Decode Module EEPROM information and optical diagnostics if available",
" [ raw on|off ]\n"
" [ hex on|off ]\n"
" [ offset N ]\n"
diff --git a/internal.h b/internal.h
index 4f96fd5..e977a81 100644
--- a/internal.h
+++ b/internal.h
@@ -253,4 +253,7 @@ int rxclass_rule_del(struct cmd_context *ctx, __u32 loc);
/* Module EEPROM parsing code */
void sff8079_show_all(const __u8 *id);
+/* Optics diagnostics */
+void sff8472_show_all(const __u8 *id);
+
#endif /* ETHTOOL_INTERNAL_H__ */
diff --git a/sfpdiag.c b/sfpdiag.c
new file mode 100644
index 0000000..f67e491
--- /dev/null
+++ b/sfpdiag.c
@@ -0,0 +1,362 @@
+/*
+ * sfpdiag.c: Implements SFF-8472 optics diagnostics.
+ *
+ * Aurelien Guillaume <aurelien@iwi.me> (C) 2012
+ * This implementation is loosely based on DOM patches
+ * from Robert Olsson <robert@herjulf.se> (C) 2009
+ * and SFF-8472 specs (ftp://ftp.seagate.com/pub/sff/SFF-8472.PDF)
+ * by SFF Committee.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <arpa/inet.h>
+#include "internal.h"
+
+/* Offsets in decimal, for direct comparison with the SFF specs */
+
+/* A0-based EEPROM offsets for DOM support checks */
+#define SFF_A0_DOM 92
+#define SFF_A0_OPTIONS 93
+#define SFF_A0_COMP 94
+
+/* EEPROM bit values for various registers */
+#define SFF_A0_DOM_EXTCAL (1 << 4)
+#define SFF_A0_DOM_INTCAL (1 << 5)
+#define SFF_A0_DOM_IMPL (1 << 6)
+#define SFF_A0_DOM_PWRT (1 << 3)
+
+#define SFF_A0_OPTIONS_AW (1 << 7)
+
+/*
+ * See ethtool.c comments about SFF-8472, this is the offset
+ * at which the A2 page is in the EEPROM blob returned by the
+ * kernel.
+ */
+#define SFF_A2_BASE 0x100
+
+/* A2-based offsets for DOM */
+#define SFF_A2_TEMP 96
+#define SFF_A2_TEMP_HALRM 0
+#define SFF_A2_TEMP_LALRM 2
+#define SFF_A2_TEMP_HWARN 4
+#define SFF_A2_TEMP_LWARN 6
+
+#define SFF_A2_VCC 98
+#define SFF_A2_VCC_HALRM 8
+#define SFF_A2_VCC_LALRM 10
+#define SFF_A2_VCC_HWARN 12
+#define SFF_A2_VCC_LWARN 14
+
+#define SFF_A2_BIAS 96
+#define SFF_A2_BIAS_HALRM 16
+#define SFF_A2_BIAS_LALRM 18
+#define SFF_A2_BIAS_HWARN 20
+#define SFF_A2_BIAS_LWARN 22
+
+#define SFF_A2_TX_PWR 102
+#define SFF_A2_TX_PWR_HALRM 24
+#define SFF_A2_TX_PWR_LALRM 26
+#define SFF_A2_TX_PWR_HWARN 28
+#define SFF_A2_TX_PWR_LWARN 30
+
+#define SFF_A2_RX_PWR 104
+#define SFF_A2_RX_PWR_HALRM 32
+#define SFF_A2_RX_PWR_LALRM 34
+#define SFF_A2_RX_PWR_HWARN 36
+#define SFF_A2_RX_PWR_LWARN 38
+
+#define SFF_A2_ALRM_FLG 112
+#define SFF_A2_WARN_FLG 116
+
+/* 32-bit little-endian calibration constants */
+#define SFF_A2_CAL_RXPWR4 56
+#define SFF_A2_CAL_RXPWR3 60
+#define SFF_A2_CAL_RXPWR2 64
+#define SFF_A2_CAL_RXPWR1 68
+#define SFF_A2_CAL_RXPWR0 72
+
+/* 16-bit little endian calibration constants */
+#define SFF_A2_CAL_TXI_SLP 76
+#define SFF_A2_CAL_TXI_OFF 78
+#define SFF_A2_CAL_TXPWR_SLP 80
+#define SFF_A2_CAL_TXPWR_OFF 82
+#define SFF_A2_CAL_T_SLP 84
+#define SFF_A2_CAL_T_OFF 86
+#define SFF_A2_CAL_V_SLP 88
+#define SFF_A2_CAL_V_OFF 90
+
+
+struct sff8472_diags {
+
+#define MCURR 0
+#define LWARN 1
+#define HWARN 2
+#define LALRM 3
+#define HALRM 4
+
+ /* [5] tables are current, low/high warn, low/high alarm */
+ __u8 supports_dom; /* Supports DOM */
+ __u8 supports_alarms; /* Supports alarm/warning thold */
+ __u8 calibrated_ext; /* Is externally calibrated */
+ __u16 bias_cur[5]; /* Measured bias current in 2uA units */
+ __u16 tx_power[5]; /* Measured TX Power in 0.1uW units */
+ __u16 rx_power[5]; /* Measured RX Power */
+ __u8 rx_power_type; /* 0 = OMA, 1 = Average power */
+ __s16 sfp_temp[5]; /* SFP Temp in 16-bit signed 1/256 Celcius */
+ __u16 sfp_voltage[5]; /* SFP voltage in 0.1mV units */
+
+};
+
+static struct sff8472_aw_flags {
+ const char *str; /* Human-readable string, null at the end */
+ int offset; /* A2-relative adress offset */
+ __u8 value; /* Alarm is on if (offset & value) != 0. */
+} sff8472_aw_flags[] = {
+ { "Laser bias current high alarm", SFF_A2_ALRM_FLG, (1 << 3) },
+ { "Laser bias current low alarm", SFF_A2_ALRM_FLG, (1 << 2) },
+ { "Laser bias current high warning", SFF_A2_WARN_FLG, (1 << 3) },
+ { "Laser bias current low warning", SFF_A2_WARN_FLG, (1 << 2) },
+
+ { "Laser output power high alarm", SFF_A2_ALRM_FLG, (1 << 1) },
+ { "Laser output power low alarm", SFF_A2_ALRM_FLG, (1 << 0) },
+ { "Laser output power high warning", SFF_A2_WARN_FLG, (1 << 1) },
+ { "Laser output power low warning", SFF_A2_WARN_FLG, (1 << 0) },
+
+ { "Module temperature high alarm", SFF_A2_ALRM_FLG, (1 << 7) },
+ { "Module temperature low alarm", SFF_A2_ALRM_FLG, (1 << 6) },
+ { "Module temperature high warning", SFF_A2_WARN_FLG, (1 << 7) },
+ { "Module temperature low warning", SFF_A2_WARN_FLG, (1 << 6) },
+
+ { "Module voltage high alarm", SFF_A2_ALRM_FLG, (1 << 5) },
+ { "Module voltage low alarm", SFF_A2_ALRM_FLG, (1 << 4) },
+ { "Module voltage high warning", SFF_A2_WARN_FLG, (1 << 5) },
+ { "Module voltage low warning", SFF_A2_WARN_FLG, (1 << 4) },
+
+ { "Laser rx power high alarm", SFF_A2_ALRM_FLG + 1, (1 << 7) },
+ { "Laser rx power low alarm", SFF_A2_ALRM_FLG + 1, (1 << 6) },
+ { "Laser rx power high warning", SFF_A2_WARN_FLG + 1, (1 << 7) },
+ { "Laser rx power low warning", SFF_A2_WARN_FLG + 1, (1 << 6) },
+
+ { NULL, 0, 0 },
+};
+
+static double convert_mw_to_dbm(double mw)
+{
+ return (10. * log10(mw / 1000.)) + 30.;
+}
+
+
+/* Most common case: 16-bit unsigned integer in a certain unit */
+#define A2_OFFSET_TO_U16(offset) \
+ (id[SFF_A2_BASE + (offset)] << 8 | id[SFF_A2_BASE + (offset) + 1])
+
+/* Calibration slope is a number between 0.0 included and 256.0 excluded. */
+#define A2_OFFSET_TO_SLP(offset) \
+ (id[SFF_A2_BASE + (offset)] + id[SFF_A2_BASE + (offset) + 1] / 256.)
+
+/* Calibration offset is an integer from -32768 to 32767 */
+#define A2_OFFSET_TO_OFF(offset) \
+ ((__s16)A2_OFFSET_TO_U16(offset))
+
+/* RXPWR(x) are IEEE-754 floating point numbers in big-endian format */
+#define A2_OFFSET_TO_RXPWRx(offset) \
+ (befloattoh((__u32 *)(id + SFF_A2_BASE + (offset))))
+
+/*
+ * 2-byte internal temperature conversions:
+ * First byte is a signed 8-bit integer, which is the temp decimal part
+ * Second byte are 1/256th of degree, which are added to the dec part.
+ */
+#define A2_OFFSET_TO_TEMP(offset) ((__s16)A2_OFFSET_TO_U16(offset))
+
+
+static void sff8472_dom_parse(const __u8 *id, struct sff8472_diags *sd)
+{
+
+ sd->bias_cur[MCURR] = A2_OFFSET_TO_U16(SFF_A2_BIAS);
+ sd->bias_cur[HALRM] = A2_OFFSET_TO_U16(SFF_A2_BIAS_HALRM);
+ sd->bias_cur[LALRM] = A2_OFFSET_TO_U16(SFF_A2_BIAS_LALRM);
+ sd->bias_cur[HWARN] = A2_OFFSET_TO_U16(SFF_A2_BIAS_HWARN);
+ sd->bias_cur[LWARN] = A2_OFFSET_TO_U16(SFF_A2_BIAS_LWARN);
+
+ sd->sfp_voltage[MCURR] = A2_OFFSET_TO_U16(SFF_A2_VCC);
+ sd->sfp_voltage[HALRM] = A2_OFFSET_TO_U16(SFF_A2_VCC_HALRM);
+ sd->sfp_voltage[LALRM] = A2_OFFSET_TO_U16(SFF_A2_VCC_LALRM);
+ sd->sfp_voltage[HWARN] = A2_OFFSET_TO_U16(SFF_A2_VCC_HWARN);
+ sd->sfp_voltage[LWARN] = A2_OFFSET_TO_U16(SFF_A2_VCC_LWARN);
+
+ sd->tx_power[MCURR] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR);
+ sd->tx_power[HALRM] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_HALRM);
+ sd->tx_power[LALRM] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_LALRM);
+ sd->tx_power[HWARN] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_HWARN);
+ sd->tx_power[LWARN] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_LWARN);
+
+ sd->rx_power[MCURR] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR);
+ sd->rx_power[HALRM] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_HALRM);
+ sd->rx_power[LALRM] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_LALRM);
+ sd->rx_power[HWARN] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_HWARN);
+ sd->rx_power[LWARN] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_LWARN);
+
+ sd->sfp_temp[MCURR] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP);
+ sd->sfp_temp[HALRM] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_HALRM);
+ sd->sfp_temp[LALRM] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_LALRM);
+ sd->sfp_temp[HWARN] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_HWARN);
+ sd->sfp_temp[LWARN] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_LWARN);
+
+}
+
+/* Converts to a float from a big-endian 4-byte source buffer. */
+static float befloattoh(const __u32 *source)
+{
+ union {
+ __u32 src;
+ float dst;
+ } converter;
+
+ converter.src = ntohl(*source);
+ return converter.dst;
+}
+
+static void sff8472_calibration(const __u8 *id, struct sff8472_diags *sd)
+{
+ int i;
+ __u16 rx_reading;
+
+ /* Calibration should occur for all values (threshold and current) */
+ for (i = 0; i < sizeof(sd->bias_cur); ++i) {
+ /*
+ * Apply calibration formula 1 (Temp., Voltage, Bias, Tx Power)
+ */
+ sd->bias_cur[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_TXI_SLP);
+ sd->tx_power[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_TXPWR_SLP);
+ sd->sfp_voltage[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_V_SLP);
+ sd->sfp_temp[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_T_SLP);
+
+ sd->bias_cur[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_TXI_OFF);
+ sd->tx_power[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_TXPWR_OFF);
+ sd->sfp_voltage[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_V_OFF);
+ sd->sfp_temp[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_T_OFF);
+
+ /*
+ * Apply calibration formula 2 (Rx Power only)
+ */
+ rx_reading = sd->rx_power[i];
+ sd->rx_power[i] = A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR0);
+ sd->rx_power[i] += rx_reading *
+ A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR1);
+ sd->rx_power[i] += rx_reading *
+ A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR2);
+ sd->rx_power[i] += rx_reading *
+ A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR3);
+ }
+}
+
+static void sff8472_parse_eeprom(const __u8 *id, struct sff8472_diags *sd)
+{
+ sd->supports_dom = id[SFF_A0_DOM] & SFF_A0_DOM_IMPL;
+ sd->supports_alarms = id[SFF_A0_OPTIONS] & SFF_A0_OPTIONS_AW;
+ sd->calibrated_ext = id[SFF_A0_DOM] & SFF_A0_DOM_EXTCAL;
+ sd->rx_power_type = id[SFF_A0_DOM] & SFF_A0_DOM_PWRT;
+
+ sff8472_dom_parse(id, sd);
+
+ /*
+ * If the SFP is externally calibrated, we need to read calibration data
+ * and compensate the already stored readings.
+ */
+ if (sd->calibrated_ext)
+ sff8472_calibration(id, sd);
+}
+
+void sff8472_show_all(const __u8 *id)
+{
+ struct sff8472_diags sd;
+ char *rx_power_string = NULL;
+ int i;
+
+ sff8472_parse_eeprom(id, &sd);
+
+ if (!sd.supports_dom) {
+ printf("\t%-41s : No\n", "Optical diagnostics support");
+ return ;
+ }
+ printf("\t%-41s : Yes\n", "Optical diagnostics support");
+
+#define PRINT_BIAS(string, index) \
+ printf("\t%-41s : %.3f mA\n", (string), \
+ (double)(sd.bias_cur[(index)] / 500.))
+
+# define PRINT_xX_PWR(string, var, index) \
+ printf("\t%-41s : %.4f mW / %.2f dBm\n", (string), \
+ (double)((var)[(index)] / 10000.), \
+ convert_mw_to_dbm((double)((var)[(index)] / 10000.)))
+
+#define PRINT_TEMP(string, index) \
+ printf("\t%-41s : %.2f degrees C / %.2f degrees F\n", (string), \
+ (double)(sd.sfp_temp[(index)] / 256.), \
+ (double)(sd.sfp_temp[(index)] / 256. * 1.8 + 32.))
+
+#define PRINT_VCC(string, index) \
+ printf("\t%-41s : %.4f V\n", (string), \
+ (double)(sd.sfp_voltage[(index)] / 10000.))
+
+ PRINT_BIAS("Laser bias current", MCURR);
+ PRINT_xX_PWR("Laser output power", sd.tx_power, MCURR);
+
+ if (!sd.rx_power_type)
+ rx_power_string = "Receiver signal OMA";
+ else
+ rx_power_string = "Receiver signal average optical power";
+
+ PRINT_xX_PWR(rx_power_string, sd.rx_power, MCURR);
+
+ PRINT_TEMP("Module temperature", MCURR);
+ PRINT_VCC("Module voltage", MCURR);
+
+ printf("\t%-41s : %s\n", "Alarm/warning flags implemented",
+ (sd.supports_alarms ? "Yes" : "No"));
+ if (sd.supports_alarms) {
+
+ for (i = 0; sff8472_aw_flags[i].str; ++i) {
+ printf("\t%-41s : %s\n", sff8472_aw_flags[i].str,
+ id[SFF_A2_BASE + sff8472_aw_flags[i].offset]
+ & sff8472_aw_flags[i].value ? "On" : "Off");
+ }
+
+ PRINT_BIAS("Laser bias current high alarm threshold", HALRM);
+ PRINT_BIAS("Laser bias current low alarm threshold", LALRM);
+ PRINT_BIAS("Laser bias current high warning threshold", HWARN);
+ PRINT_BIAS("Laser bias current low warning threshold", LWARN);
+
+ PRINT_xX_PWR("Laser output power high alarm threshold",
+ sd.tx_power, HALRM);
+ PRINT_xX_PWR("Laser output power low alarm threshold",
+ sd.tx_power, LALRM);
+ PRINT_xX_PWR("Laser output power high warning threshold",
+ sd.tx_power, HWARN);
+ PRINT_xX_PWR("Laser output power low warning threshold",
+ sd.tx_power, LWARN);
+
+ PRINT_TEMP("Module temperature high alarm threshold", HALRM);
+ PRINT_TEMP("Module temperature low alarm threshold", LALRM);
+ PRINT_TEMP("Module temperature high warning threshold", HWARN);
+ PRINT_TEMP("Module temperature low warning threshold", LWARN);
+
+ PRINT_VCC("Module voltage high alarm threshold", HALRM);
+ PRINT_VCC("Module voltage low alarm threshold", LALRM);
+ PRINT_VCC("Module voltage high warning threshold", HWARN);
+ PRINT_VCC("Module voltage low warning threshold", LWARN);
+
+ PRINT_xX_PWR("Laser rx power high alarm threshold",
+ sd.rx_power, HALRM);
+ PRINT_xX_PWR("Laser rx power low alarm threshold",
+ sd.rx_power, LALRM);
+ PRINT_xX_PWR("Laser rx power high warning threshold",
+ sd.rx_power, HWARN);
+ PRINT_xX_PWR("Laser rx power low warning threshold",
+ sd.rx_power, LWARN);
+ }
+
+}
+
diff --git a/sfpid.c b/sfpid.c
index a4a671d..4f88aa2 100644
--- a/sfpid.c
+++ b/sfpid.c
@@ -12,7 +12,7 @@
static void sff8079_show_identifier(const __u8 *id)
{
- printf("\tIdentifier : 0x%02x", id[0]);
+ printf("\t%-41s : 0x%02x", "Identifier", id[0]);
switch (id[0]) {
case 0x00:
printf(" (no module present, unknown, or unspecified)\n");
@@ -34,7 +34,7 @@ static void sff8079_show_identifier(const __u8 *id)
static void sff8079_show_ext_identifier(const __u8 *id)
{
- printf("\tExtended identifier : 0x%02x", id[1]);
+ printf("\t%-41s : 0x%02x", "Extended identifier", id[1]);
if (id[1] == 0x00)
printf(" (GBIC not specified / not MOD_DEF compliant)\n");
else if (id[1] == 0x04)
@@ -47,7 +47,7 @@ static void sff8079_show_ext_identifier(const __u8 *id)
static void sff8079_show_connector(const __u8 *id)
{
- printf("\tConnector : 0x%02x", id[2]);
+ printf("\t%-41s : 0x%02x", "Connector", id[2]);
switch (id[2]) {
case 0x00:
printf(" (unknown or unspecified)\n");
@@ -105,10 +105,12 @@ static void sff8079_show_connector(const __u8 *id)
static void sff8079_show_transceiver(const __u8 *id)
{
- static const char *pfx = "\t : =>";
+ static const char *pfx =
+ "\tTransceiver type :";
- printf("\tTransceiver codes : 0x%02x 0x%02x 0x%02x" \
+ printf("\t%-41s : 0x%02x 0x%02x 0x%02x " \
"0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
+ "Transceiver codes",
id[3], id[4], id[5], id[6],
id[7], id[8], id[9], id[10]);
/* 10G Ethernet Compliance Codes */
@@ -239,7 +241,7 @@ static void sff8079_show_transceiver(const __u8 *id)
static void sff8079_show_encoding(const __u8 *id)
{
- printf("\tEncoding : 0x%02x", id[11]);
+ printf("\t%-41s : 0x%02x", "Encoding", id[11]);
switch (id[11]) {
case 0x00:
printf(" (unspecified)\n");
@@ -270,7 +272,7 @@ static void sff8079_show_encoding(const __u8 *id)
static void sff8079_show_rate_identifier(const __u8 *id)
{
- printf("\tRate identifier : 0x%02x", id[13]);
+ printf("\t%-41s : 0x%02x", "Rate identifier", id[13]);
switch (id[13]) {
case 0x00:
printf(" (unspecified)\n");
@@ -295,14 +297,14 @@ static void sff8079_show_rate_identifier(const __u8 *id)
static void sff8079_show_oui(const __u8 *id)
{
- printf("\tVendor OUI : %02x:%02x:%02x\n",
+ printf("\t%-41s : %02x:%02x:%02x\n", "Vendor OUI",
id[37], id[38], id[39]);
}
static void sff8079_show_wavelength_or_copper_compliance(const __u8 *id)
{
if (id[8] & (1 << 2)) {
- printf("\tPassive Cu cmplnce. : 0x%02x", id[60]);
+ printf("\t%-41s : 0x%02x", "Passive Cu cmplnce.", id[60]);
switch (id[60]) {
case 0x00:
printf(" (unspecified)");
@@ -316,7 +318,7 @@ static void sff8079_show_wavelength_or_copper_compliance(const __u8 *id)
}
printf(" [SFF-8472 rev10.4 only]\n");
} else if (id[8] & (1 << 3)) {
- printf("\tActive Cu cmplnce. : 0x%02x", id[60]);
+ printf("\t%-41s : 0x%02x", "Active Cu cmplnce.", id[60]);
switch (id[60]) {
case 0x00:
printf(" (unspecified)");
@@ -333,7 +335,7 @@ static void sff8079_show_wavelength_or_copper_compliance(const __u8 *id)
}
printf(" [SFF-8472 rev10.4 only]\n");
} else {
- printf("\tLaser wavelength : %unm\n",
+ printf("\t%-41s : %unm\n", "Laser wavelength",
(id[60] << 8) | id[61]);
}
}
@@ -344,7 +346,7 @@ static void sff8079_show_value_with_unit(const __u8 *id, unsigned int reg,
{
unsigned int val = id[reg];
- printf("\t%-20s: %u%s\n", name, val * mult, unit);
+ printf("\t%-41s : %u%s\n", name, val * mult, unit);
}
static void sff8079_show_ascii(const __u8 *id, unsigned int first_reg,
@@ -352,7 +354,7 @@ static void sff8079_show_ascii(const __u8 *id, unsigned int first_reg,
{
unsigned int reg, val;
- printf("\t%-20s: ", name);
+ printf("\t%-41s : ", name);
for (reg = first_reg; reg <= last_reg; reg++) {
val = id[reg];
putchar(((val >= 32) && (val <= 126)) ? val : '_');
diff --git a/test-cmdline.c b/test-cmdline.c
index 85b4ce0..f1d4555 100644
--- a/test-cmdline.c
+++ b/test-cmdline.c
@@ -213,6 +213,8 @@ static struct test_case {
{ 0, "-m devname" },
{ 1, "--dump-module-eeprom" },
{ 0, "--dump-module-eeprom devname" },
+ { 1, "--module-info" },
+ { 0, "--module-info devname" },
{ 0, "-m devname raw on" },
{ 0, "-m devname raw off" },
{ 0, "-m devname hex on" },
--
1.7.0.4
^ permalink raw reply related
* [PATCH net-next] tcp: don't abort splice() after small transfers
From: Eric Dumazet @ 2012-12-02 21:49 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Willy Tarreau
From: Willy Tarreau <w@1wt.eu>
TCP coalescing added a regression in splice(socket->pipe) performance
for some workloads, because of the way tcp_read_sock() is implemented.
The reason for this is the break when (offset + 1 != skb->len).
As we released the socket lock, this condition is possible if TCP stack
added a fragment to the skb, which can happen with TCP coalescing.
So let's go back to the beginning of the loop when this happens,
to give a chance to splice more frags per system call.
Doing so fixes the issue and makes GRO 10% faster than LRO
on CPU-bound splice() workloads instead of the opposite.
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/tcp.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1aca02c..8fc5b3bd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1494,15 +1494,19 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
copied += used;
offset += used;
}
- /*
- * If recv_actor drops the lock (e.g. TCP splice
+ /* If recv_actor drops the lock (e.g. TCP splice
* receive) the skb pointer might be invalid when
* getting here: tcp_collapse might have deleted it
* while aggregating skbs from the socket queue.
*/
- skb = tcp_recv_skb(sk, seq-1, &offset);
- if (!skb || (offset+1 != skb->len))
+ skb = tcp_recv_skb(sk, seq - 1, &offset);
+ if (!skb)
break;
+ /* TCP coalescing might have appended data to the skb.
+ * Try to splice more frags
+ */
+ if (offset + 1 != skb->len)
+ continue;
}
if (tcp_hdr(skb)->fin) {
sk_eat_skb(sk, skb, false);
^ permalink raw reply related
* Re: Optics (SFP) monitoring on ixgbe and igbe
From: Aurélien @ 2012-12-02 22:00 UTC (permalink / raw)
To: Ben Hutchings; +Cc: netdev
In-Reply-To: <CAPN4dA_Ar8rnWE14Dq6uJHhrMY9ttEE4XcWg88jcVe4zc=_e8g@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 571 bytes --]
On Sun, Dec 2, 2012 at 10:47 PM, Aurélien <footplus@gmail.com> wrote:
>>
>> This version drops the -lm completely, so it doesn't link. Maybe you
>> edited the generated Makefile or Makefile.in?
>
> No, I just stupidly forgot to make distclean & autogen after removing
> all libm checks. Re-added AC_CHECK_LIB to link with it.
>
Just after re-reading this, I thought it was silly; I instead edited
Makefile.am and reverted the configure.ac change. Here's a new
full patch with that fix, along with the rest.
Sorry for the noise.
--
Aurélien Guillaume
[-- Attachment #2: 0001-Implemented-basic-optics-diagnostics-for-SFF-8472.patch --]
[-- Type: application/octet-stream, Size: 22479 bytes --]
From c6e58988c0c30123f78e0ae83730697bb2b159d0 Mon Sep 17 00:00:00 2001
From: Aurelien Guillaume <aurelien@iwi.me>
Date: Sun, 2 Dec 2012 21:21:01 +0100
Subject: [PATCH] Implemented basic optics diagnostics for SFF-8472
Signed-off-by: Aurelien Guillaume <aurelien@iwi.me>
---
Makefile.am | 3 +-
ethtool.8.in | 8 +-
ethtool.c | 17 +++-
internal.h | 3 +
sfpdiag.c | 362 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
sfpid.c | 28 +++--
test-cmdline.c | 2 +
7 files changed, 404 insertions(+), 19 deletions(-)
create mode 100644 sfpdiag.c
diff --git a/Makefile.am b/Makefile.am
index e33f71f..ba1faa6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,5 @@
AM_CFLAGS = -Wall
+LDADD = -lm
man_MANS = ethtool.8
EXTRA_DIST = LICENSE ethtool.8 ethtool.spec.in aclocal.m4 ChangeLog autogen.sh
@@ -9,7 +10,7 @@ ethtool_SOURCES = ethtool.c ethtool-copy.h internal.h net_tstamp-copy.h \
fec_8xx.c ibm_emac.c ixgb.c ixgbe.c natsemi.c \
pcnet32.c realtek.c tg3.c marvell.c vioc.c \
smsc911x.c at76c50x-usb.c sfc.c stmmac.c \
- rxclass.c sfpid.c
+ rxclass.c sfpid.c sfpdiag.c
TESTS = test-cmdline test-features
check_PROGRAMS = test-cmdline test-features
diff --git a/ethtool.8.in b/ethtool.8.in
index a3c7fbb..e701919 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -312,7 +312,7 @@ ethtool \- query or control network driver and hardware settings
.BN other
.BN combined
.HP
-.B ethtool \-m|\-\-dump\-module\-eeprom
+.B ethtool \-m|\-\-dump\-module\-eeprom|\-\-module\-info
.I devname
.B2 raw on off
.B2 hex on off
@@ -815,8 +815,10 @@ Changes the number of channels used only for other purposes e.g. link interrupts
.BI combined \ N
Changes the number of multi-purpose channels.
.TP
-.B \-m \-\-dump\-module\-eeprom
-Retrieves and if possible decodes the EEPROM from plugin modules, e.g SFP+, QSFP
+.B \-m \-\-dump\-module\-eeprom \-\-module\-info
+Retrieves and if possible decodes the EEPROM from plugin modules, e.g SFP+, QSFP.
+If the driver and module support it, the optical diagnostic information is also
+read and decoded.
.TP
.B \-\-show\-priv\-flags
Queries the specified network device for its private flags. The
diff --git a/ethtool.c b/ethtool.c
index 3db7fec..345c21c 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -3604,6 +3604,16 @@ static int do_getmodule(struct cmd_context *ctx)
return 1;
}
+ /*
+ * SFF-8079 EEPROM layout contains the memory available at A0 address on
+ * the PHY EEPROM.
+ * SFF-8472 defines a virtual extension of the EEPROM, where the
+ * microcontroller on the SFP/SFP+ generates a page at the A2 address,
+ * which contains data relative to optical diagnostics.
+ * The current kernel implementation returns a blob, which contains:
+ * - ETH_MODULE_SFF_8079 => The A0 page only.
+ * - ETH_MODULE_SFF_8472 => The A0 and A2 page concatenated.
+ */
if (geeprom_dump_raw) {
fwrite(eeprom->data, 1, eeprom->len, stdout);
} else {
@@ -3613,8 +3623,11 @@ static int do_getmodule(struct cmd_context *ctx)
} else if (!geeprom_dump_hex) {
switch (modinfo.type) {
case ETH_MODULE_SFF_8079:
+ sff8079_show_all(eeprom->data);
+ break;
case ETH_MODULE_SFF_8472:
sff8079_show_all(eeprom->data);
+ sff8472_show_all(eeprom->data);
break;
default:
geeprom_dump_hex = 1;
@@ -3831,8 +3844,8 @@ static const struct option {
{ "--show-priv-flags" , 1, do_gprivflags, "Query private flags" },
{ "--set-priv-flags", 1, do_sprivflags, "Set private flags",
" FLAG on|off ...\n" },
- { "-m|--dump-module-eeprom", 1, do_getmodule,
- "Qeuery/Decode Module EEPROM information",
+ { "-m|--dump-module-eeprom|--module-info", 1, do_getmodule,
+ "Query/Decode Module EEPROM information and optical diagnostics if available",
" [ raw on|off ]\n"
" [ hex on|off ]\n"
" [ offset N ]\n"
diff --git a/internal.h b/internal.h
index 4f96fd5..e977a81 100644
--- a/internal.h
+++ b/internal.h
@@ -253,4 +253,7 @@ int rxclass_rule_del(struct cmd_context *ctx, __u32 loc);
/* Module EEPROM parsing code */
void sff8079_show_all(const __u8 *id);
+/* Optics diagnostics */
+void sff8472_show_all(const __u8 *id);
+
#endif /* ETHTOOL_INTERNAL_H__ */
diff --git a/sfpdiag.c b/sfpdiag.c
new file mode 100644
index 0000000..f67e491
--- /dev/null
+++ b/sfpdiag.c
@@ -0,0 +1,362 @@
+/*
+ * sfpdiag.c: Implements SFF-8472 optics diagnostics.
+ *
+ * Aurelien Guillaume <aurelien@iwi.me> (C) 2012
+ * This implementation is loosely based on DOM patches
+ * from Robert Olsson <robert@herjulf.se> (C) 2009
+ * and SFF-8472 specs (ftp://ftp.seagate.com/pub/sff/SFF-8472.PDF)
+ * by SFF Committee.
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <arpa/inet.h>
+#include "internal.h"
+
+/* Offsets in decimal, for direct comparison with the SFF specs */
+
+/* A0-based EEPROM offsets for DOM support checks */
+#define SFF_A0_DOM 92
+#define SFF_A0_OPTIONS 93
+#define SFF_A0_COMP 94
+
+/* Bit masks: SFF_A0_DOM_* test byte SFF_A0_DOM, SFF_A0_OPTIONS_* test byte SFF_A0_OPTIONS */
+#define SFF_A0_DOM_EXTCAL (1 << 4)
+#define SFF_A0_DOM_INTCAL (1 << 5)
+#define SFF_A0_DOM_IMPL (1 << 6)
+#define SFF_A0_DOM_PWRT (1 << 3)
+
+#define SFF_A0_OPTIONS_AW (1 << 7)
+
+/*
+ * See ethtool.c comments about SFF-8472, this is the offset
+ * at which the A2 page is in the EEPROM blob returned by the
+ * kernel.
+ */
+#define SFF_A2_BASE 0x100
+
+/* A2-based offsets for DOM (the five live readings are consecutive 16-bit fields at 96..104) */
+#define SFF_A2_TEMP 96
+#define SFF_A2_TEMP_HALRM 0
+#define SFF_A2_TEMP_LALRM 2
+#define SFF_A2_TEMP_HWARN 4
+#define SFF_A2_TEMP_LWARN 6
+
+#define SFF_A2_VCC 98
+#define SFF_A2_VCC_HALRM 8
+#define SFF_A2_VCC_LALRM 10
+#define SFF_A2_VCC_HWARN 12
+#define SFF_A2_VCC_LWARN 14
+
+#define SFF_A2_BIAS 100
+#define SFF_A2_BIAS_HALRM 16
+#define SFF_A2_BIAS_LALRM 18
+#define SFF_A2_BIAS_HWARN 20
+#define SFF_A2_BIAS_LWARN 22
+
+#define SFF_A2_TX_PWR 102
+#define SFF_A2_TX_PWR_HALRM 24
+#define SFF_A2_TX_PWR_LALRM 26
+#define SFF_A2_TX_PWR_HWARN 28
+#define SFF_A2_TX_PWR_LWARN 30
+
+#define SFF_A2_RX_PWR 104
+#define SFF_A2_RX_PWR_HALRM 32
+#define SFF_A2_RX_PWR_LALRM 34
+#define SFF_A2_RX_PWR_HWARN 36
+#define SFF_A2_RX_PWR_LWARN 38
+
+#define SFF_A2_ALRM_FLG 112
+#define SFF_A2_WARN_FLG 116
+
+/* 32-bit big-endian IEEE-754 float calibration constants (read via befloattoh) */
+#define SFF_A2_CAL_RXPWR4 56
+#define SFF_A2_CAL_RXPWR3 60
+#define SFF_A2_CAL_RXPWR2 64
+#define SFF_A2_CAL_RXPWR1 68
+#define SFF_A2_CAL_RXPWR0 72
+
+/* 16-bit calibration slopes and offsets, most significant byte first */
+#define SFF_A2_CAL_TXI_SLP 76
+#define SFF_A2_CAL_TXI_OFF 78
+#define SFF_A2_CAL_TXPWR_SLP 80
+#define SFF_A2_CAL_TXPWR_OFF 82
+#define SFF_A2_CAL_T_SLP 84
+#define SFF_A2_CAL_T_OFF 86
+#define SFF_A2_CAL_V_SLP 88
+#define SFF_A2_CAL_V_OFF 90
+
+
+struct sff8472_diags {
+ /* Indices into the [5] arrays below */
+#define MCURR 0
+#define LWARN 1
+#define HWARN 2
+#define LALRM 3
+#define HALRM 4
+
+ /* [5] tables are current, low/high warn, low/high alarm; flags are zero / non-zero */
+ __u8 supports_dom; /* Supports DOM */
+ __u8 supports_alarms; /* Supports alarm/warning thold */
+ __u8 calibrated_ext; /* Is externally calibrated */
+ __u16 bias_cur[5]; /* Measured bias current in 2uA units */
+ __u16 tx_power[5]; /* Measured TX Power in 0.1uW units */
+ __u16 rx_power[5]; /* Measured RX Power in 0.1uW units */
+ __u8 rx_power_type; /* 0 = OMA, non-zero = Average power */
+ __s16 sfp_temp[5]; /* SFP Temp in 16-bit signed 1/256 Celsius */
+ __u16 sfp_voltage[5]; /* SFP voltage in 0.1mV units */
+
+};
+
+static struct sff8472_aw_flags {
+ const char *str; /* Human-readable string, null at the end */
+ int offset; /* A2-relative address offset of the flag byte */
+ __u8 value; /* Flag is on if (id[SFF_A2_BASE + offset] & value) != 0. */
+} sff8472_aw_flags[] = {
+ { "Laser bias current high alarm", SFF_A2_ALRM_FLG, (1 << 3) },
+ { "Laser bias current low alarm", SFF_A2_ALRM_FLG, (1 << 2) },
+ { "Laser bias current high warning", SFF_A2_WARN_FLG, (1 << 3) },
+ { "Laser bias current low warning", SFF_A2_WARN_FLG, (1 << 2) },
+
+ { "Laser output power high alarm", SFF_A2_ALRM_FLG, (1 << 1) },
+ { "Laser output power low alarm", SFF_A2_ALRM_FLG, (1 << 0) },
+ { "Laser output power high warning", SFF_A2_WARN_FLG, (1 << 1) },
+ { "Laser output power low warning", SFF_A2_WARN_FLG, (1 << 0) },
+
+ { "Module temperature high alarm", SFF_A2_ALRM_FLG, (1 << 7) },
+ { "Module temperature low alarm", SFF_A2_ALRM_FLG, (1 << 6) },
+ { "Module temperature high warning", SFF_A2_WARN_FLG, (1 << 7) },
+ { "Module temperature low warning", SFF_A2_WARN_FLG, (1 << 6) },
+
+ { "Module voltage high alarm", SFF_A2_ALRM_FLG, (1 << 5) },
+ { "Module voltage low alarm", SFF_A2_ALRM_FLG, (1 << 4) },
+ { "Module voltage high warning", SFF_A2_WARN_FLG, (1 << 5) },
+ { "Module voltage low warning", SFF_A2_WARN_FLG, (1 << 4) },
+
+ { "Laser rx power high alarm", SFF_A2_ALRM_FLG + 1, (1 << 7) },
+ { "Laser rx power low alarm", SFF_A2_ALRM_FLG + 1, (1 << 6) },
+ { "Laser rx power high warning", SFF_A2_WARN_FLG + 1, (1 << 7) },
+ { "Laser rx power low warning", SFF_A2_WARN_FLG + 1, (1 << 6) },
+
+ { NULL, 0, 0 },
+};
+
+static double convert_mw_to_dbm(double mw)
+{
+ return (10. * log10(mw / 1000.)) + 30.; /* algebraically 10 * log10(mw) */
+}
+
+
+/* Most common case: big-endian 16-bit unsigned integer; all macros read the caller's `id` */
+#define A2_OFFSET_TO_U16(offset) \
+ (id[SFF_A2_BASE + (offset)] << 8 | id[SFF_A2_BASE + (offset) + 1])
+
+/* Calibration slope is a number between 0.0 included and 256.0 excluded. */
+#define A2_OFFSET_TO_SLP(offset) \
+ (id[SFF_A2_BASE + (offset)] + id[SFF_A2_BASE + (offset) + 1] / 256.)
+
+/* Calibration offset is an integer from -32768 to 32767 */
+#define A2_OFFSET_TO_OFF(offset) \
+ ((__s16)A2_OFFSET_TO_U16(offset))
+
+/* RXPWR(x) are IEEE-754 floating point numbers in big-endian format */
+#define A2_OFFSET_TO_RXPWRx(offset) \
+ (befloattoh((__u32 *)(id + SFF_A2_BASE + (offset))))
+
+/*
+ * 2-byte internal temperature conversions:
+ * First byte is a signed 8-bit integer, the whole-degree part of the temp.
+ * Second byte is in 1/256ths of a degree, added to the whole-degree part.
+ */
+#define A2_OFFSET_TO_TEMP(offset) ((__s16)A2_OFFSET_TO_U16(offset))
+
+
+static void sff8472_dom_parse(const __u8 *id, struct sff8472_diags *sd)
+{
+ /* Copy the raw reading and its four thresholds for each quantity out of the A2 page. */
+ sd->bias_cur[MCURR] = A2_OFFSET_TO_U16(SFF_A2_BIAS);
+ sd->bias_cur[HALRM] = A2_OFFSET_TO_U16(SFF_A2_BIAS_HALRM);
+ sd->bias_cur[LALRM] = A2_OFFSET_TO_U16(SFF_A2_BIAS_LALRM);
+ sd->bias_cur[HWARN] = A2_OFFSET_TO_U16(SFF_A2_BIAS_HWARN);
+ sd->bias_cur[LWARN] = A2_OFFSET_TO_U16(SFF_A2_BIAS_LWARN);
+
+ sd->sfp_voltage[MCURR] = A2_OFFSET_TO_U16(SFF_A2_VCC);
+ sd->sfp_voltage[HALRM] = A2_OFFSET_TO_U16(SFF_A2_VCC_HALRM);
+ sd->sfp_voltage[LALRM] = A2_OFFSET_TO_U16(SFF_A2_VCC_LALRM);
+ sd->sfp_voltage[HWARN] = A2_OFFSET_TO_U16(SFF_A2_VCC_HWARN);
+ sd->sfp_voltage[LWARN] = A2_OFFSET_TO_U16(SFF_A2_VCC_LWARN);
+
+ sd->tx_power[MCURR] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR);
+ sd->tx_power[HALRM] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_HALRM);
+ sd->tx_power[LALRM] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_LALRM);
+ sd->tx_power[HWARN] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_HWARN);
+ sd->tx_power[LWARN] = A2_OFFSET_TO_U16(SFF_A2_TX_PWR_LWARN);
+
+ sd->rx_power[MCURR] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR);
+ sd->rx_power[HALRM] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_HALRM);
+ sd->rx_power[LALRM] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_LALRM);
+ sd->rx_power[HWARN] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_HWARN);
+ sd->rx_power[LWARN] = A2_OFFSET_TO_U16(SFF_A2_RX_PWR_LWARN);
+
+ sd->sfp_temp[MCURR] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP);
+ sd->sfp_temp[HALRM] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_HALRM);
+ sd->sfp_temp[LALRM] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_LALRM);
+ sd->sfp_temp[HWARN] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_HWARN);
+ sd->sfp_temp[LWARN] = A2_OFFSET_TO_TEMP(SFF_A2_TEMP_LWARN);
+
+}
+
+/* Converts to a float from a big-endian 4-byte source buffer (any alignment). */
+static float befloattoh(const __u32 *source)
+{
+	const __u8 *b = (const __u8 *)source;	/* byte reads: no alignment/aliasing trap */
+	union {
+		__u32 src;
+		float dst;
+	} converter;
+	converter.src = (__u32)b[0] << 24 | b[1] << 16 | b[2] << 8 | b[3];
+	return converter.dst;
+}
+
+static void sff8472_calibration(const __u8 *id, struct sff8472_diags *sd)
+{
+	unsigned int i;
+	double x, p;	/* raw RX reading and its calibrated value */
+
+	/* All values (current + thresholds); bound is elements, not bytes */
+	for (i = 0; i < sizeof(sd->bias_cur) / sizeof(sd->bias_cur[0]); ++i) {
+		/*
+		 * Apply calibration formula 1 (Temp., Voltage, Bias, Tx Power)
+		 */
+		sd->bias_cur[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_TXI_SLP);
+		sd->tx_power[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_TXPWR_SLP);
+		sd->sfp_voltage[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_V_SLP);
+		sd->sfp_temp[i] *= A2_OFFSET_TO_SLP(SFF_A2_CAL_T_SLP);
+
+		sd->bias_cur[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_TXI_OFF);
+		sd->tx_power[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_TXPWR_OFF);
+		sd->sfp_voltage[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_V_OFF);
+		sd->sfp_temp[i] += A2_OFFSET_TO_OFF(SFF_A2_CAL_T_OFF);
+
+		/*
+		 * Apply calibration formula 2 (Rx Power only): a 4th-order
+		 * polynomial in the raw reading, evaluated by Horner's rule.
+		 */
+		x = sd->rx_power[i];
+		p = A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR4);
+		p = p * x + A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR3);
+		p = p * x + A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR2);
+		p = p * x + A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR1);
+		p = p * x + A2_OFFSET_TO_RXPWRx(SFF_A2_CAL_RXPWR0);
+		sd->rx_power[i] = (__u16)p;
+	}
+}
+
+static void sff8472_parse_eeprom(const __u8 *id, struct sff8472_diags *sd)
+{
+ sd->supports_dom = id[SFF_A0_DOM] & SFF_A0_DOM_IMPL;
+ sd->supports_alarms = id[SFF_A0_OPTIONS] & SFF_A0_OPTIONS_AW;
+ sd->calibrated_ext = id[SFF_A0_DOM] & SFF_A0_DOM_EXTCAL;
+ sd->rx_power_type = id[SFF_A0_DOM] & SFF_A0_DOM_PWRT;
+ /* The flags above hold the raw masked bit (zero / non-zero), not 0 / 1. */
+ sff8472_dom_parse(id, sd);
+
+ /*
+ * If the SFP is externally calibrated, we need to read calibration data
+ * and compensate the already stored readings.
+ */
+ if (sd->calibrated_ext)
+ sff8472_calibration(id, sd);
+}
+
+void sff8472_show_all(const __u8 *id)
+{
+ struct sff8472_diags sd;
+ const char *rx_power_string = NULL;
+ int i;
+ /* Prints DOM support, the live readings, then alarm flags and thresholds if implemented. */
+ sff8472_parse_eeprom(id, &sd);
+
+ if (!sd.supports_dom) {
+ printf("\t%-41s : No\n", "Optical diagnostics support");
+ return ;
+ }
+ printf("\t%-41s : Yes\n", "Optical diagnostics support");
+ /* Stored units: bias 2uA, power 0.1uW, temperature 1/256 degree C, voltage 0.1mV. */
+#define PRINT_BIAS(string, index) \
+ printf("\t%-41s : %.3f mA\n", (string), \
+ (double)(sd.bias_cur[(index)] / 500.))
+
+# define PRINT_xX_PWR(string, var, index) \
+ printf("\t%-41s : %.4f mW / %.2f dBm\n", (string), \
+ (double)((var)[(index)] / 10000.), \
+ convert_mw_to_dbm((double)((var)[(index)] / 10000.)))
+
+#define PRINT_TEMP(string, index) \
+ printf("\t%-41s : %.2f degrees C / %.2f degrees F\n", (string), \
+ (double)(sd.sfp_temp[(index)] / 256.), \
+ (double)(sd.sfp_temp[(index)] / 256. * 1.8 + 32.))
+
+#define PRINT_VCC(string, index) \
+ printf("\t%-41s : %.4f V\n", (string), \
+ (double)(sd.sfp_voltage[(index)] / 10000.))
+
+ PRINT_BIAS("Laser bias current", MCURR);
+ PRINT_xX_PWR("Laser output power", sd.tx_power, MCURR);
+
+ if (!sd.rx_power_type)
+ rx_power_string = "Receiver signal OMA";
+ else
+ rx_power_string = "Receiver signal average optical power";
+
+ PRINT_xX_PWR(rx_power_string, sd.rx_power, MCURR);
+
+ PRINT_TEMP("Module temperature", MCURR);
+ PRINT_VCC("Module voltage", MCURR);
+
+ printf("\t%-41s : %s\n", "Alarm/warning flags implemented",
+ (sd.supports_alarms ? "Yes" : "No"));
+ if (sd.supports_alarms) {
+
+ for (i = 0; sff8472_aw_flags[i].str; ++i) {
+ printf("\t%-41s : %s\n", sff8472_aw_flags[i].str,
+ id[SFF_A2_BASE + sff8472_aw_flags[i].offset]
+ & sff8472_aw_flags[i].value ? "On" : "Off");
+ }
+
+ PRINT_BIAS("Laser bias current high alarm threshold", HALRM);
+ PRINT_BIAS("Laser bias current low alarm threshold", LALRM);
+ PRINT_BIAS("Laser bias current high warning threshold", HWARN);
+ PRINT_BIAS("Laser bias current low warning threshold", LWARN);
+
+ PRINT_xX_PWR("Laser output power high alarm threshold",
+ sd.tx_power, HALRM);
+ PRINT_xX_PWR("Laser output power low alarm threshold",
+ sd.tx_power, LALRM);
+ PRINT_xX_PWR("Laser output power high warning threshold",
+ sd.tx_power, HWARN);
+ PRINT_xX_PWR("Laser output power low warning threshold",
+ sd.tx_power, LWARN);
+
+ PRINT_TEMP("Module temperature high alarm threshold", HALRM);
+ PRINT_TEMP("Module temperature low alarm threshold", LALRM);
+ PRINT_TEMP("Module temperature high warning threshold", HWARN);
+ PRINT_TEMP("Module temperature low warning threshold", LWARN);
+
+ PRINT_VCC("Module voltage high alarm threshold", HALRM);
+ PRINT_VCC("Module voltage low alarm threshold", LALRM);
+ PRINT_VCC("Module voltage high warning threshold", HWARN);
+ PRINT_VCC("Module voltage low warning threshold", LWARN);
+
+ PRINT_xX_PWR("Laser rx power high alarm threshold",
+ sd.rx_power, HALRM);
+ PRINT_xX_PWR("Laser rx power low alarm threshold",
+ sd.rx_power, LALRM);
+ PRINT_xX_PWR("Laser rx power high warning threshold",
+ sd.rx_power, HWARN);
+ PRINT_xX_PWR("Laser rx power low warning threshold",
+ sd.rx_power, LWARN);
+ }
+
+}
+
diff --git a/sfpid.c b/sfpid.c
index a4a671d..4f88aa2 100644
--- a/sfpid.c
+++ b/sfpid.c
@@ -12,7 +12,7 @@
static void sff8079_show_identifier(const __u8 *id)
{
- printf("\tIdentifier : 0x%02x", id[0]);
+ printf("\t%-41s : 0x%02x", "Identifier", id[0]);
switch (id[0]) {
case 0x00:
printf(" (no module present, unknown, or unspecified)\n");
@@ -34,7 +34,7 @@ static void sff8079_show_identifier(const __u8 *id)
static void sff8079_show_ext_identifier(const __u8 *id)
{
- printf("\tExtended identifier : 0x%02x", id[1]);
+ printf("\t%-41s : 0x%02x", "Extended identifier", id[1]);
if (id[1] == 0x00)
printf(" (GBIC not specified / not MOD_DEF compliant)\n");
else if (id[1] == 0x04)
@@ -47,7 +47,7 @@ static void sff8079_show_ext_identifier(const __u8 *id)
static void sff8079_show_connector(const __u8 *id)
{
- printf("\tConnector : 0x%02x", id[2]);
+ printf("\t%-41s : 0x%02x", "Connector", id[2]);
switch (id[2]) {
case 0x00:
printf(" (unknown or unspecified)\n");
@@ -105,10 +105,12 @@ static void sff8079_show_connector(const __u8 *id)
static void sff8079_show_transceiver(const __u8 *id)
{
- static const char *pfx = "\t : =>";
+ static const char *pfx =
+ "\tTransceiver type :";
- printf("\tTransceiver codes : 0x%02x 0x%02x 0x%02x" \
+ printf("\t%-41s : 0x%02x 0x%02x 0x%02x " \
"0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
+ "Transceiver codes",
id[3], id[4], id[5], id[6],
id[7], id[8], id[9], id[10]);
/* 10G Ethernet Compliance Codes */
@@ -239,7 +241,7 @@ static void sff8079_show_transceiver(const __u8 *id)
static void sff8079_show_encoding(const __u8 *id)
{
- printf("\tEncoding : 0x%02x", id[11]);
+ printf("\t%-41s : 0x%02x", "Encoding", id[11]);
switch (id[11]) {
case 0x00:
printf(" (unspecified)\n");
@@ -270,7 +272,7 @@ static void sff8079_show_encoding(const __u8 *id)
static void sff8079_show_rate_identifier(const __u8 *id)
{
- printf("\tRate identifier : 0x%02x", id[13]);
+ printf("\t%-41s : 0x%02x", "Rate identifier", id[13]);
switch (id[13]) {
case 0x00:
printf(" (unspecified)\n");
@@ -295,14 +297,14 @@ static void sff8079_show_rate_identifier(const __u8 *id)
static void sff8079_show_oui(const __u8 *id)
{
- printf("\tVendor OUI : %02x:%02x:%02x\n",
+ printf("\t%-41s : %02x:%02x:%02x\n", "Vendor OUI",
id[37], id[38], id[39]);
}
static void sff8079_show_wavelength_or_copper_compliance(const __u8 *id)
{
if (id[8] & (1 << 2)) {
- printf("\tPassive Cu cmplnce. : 0x%02x", id[60]);
+ printf("\t%-41s : 0x%02x", "Passive Cu cmplnce.", id[60]);
switch (id[60]) {
case 0x00:
printf(" (unspecified)");
@@ -316,7 +318,7 @@ static void sff8079_show_wavelength_or_copper_compliance(const __u8 *id)
}
printf(" [SFF-8472 rev10.4 only]\n");
} else if (id[8] & (1 << 3)) {
- printf("\tActive Cu cmplnce. : 0x%02x", id[60]);
+ printf("\t%-41s : 0x%02x", "Active Cu cmplnce.", id[60]);
switch (id[60]) {
case 0x00:
printf(" (unspecified)");
@@ -333,7 +335,7 @@ static void sff8079_show_wavelength_or_copper_compliance(const __u8 *id)
}
printf(" [SFF-8472 rev10.4 only]\n");
} else {
- printf("\tLaser wavelength : %unm\n",
+ printf("\t%-41s : %unm\n", "Laser wavelength",
(id[60] << 8) | id[61]);
}
}
@@ -344,7 +346,7 @@ static void sff8079_show_value_with_unit(const __u8 *id, unsigned int reg,
{
unsigned int val = id[reg];
- printf("\t%-20s: %u%s\n", name, val * mult, unit);
+ printf("\t%-41s : %u%s\n", name, val * mult, unit);
}
static void sff8079_show_ascii(const __u8 *id, unsigned int first_reg,
@@ -352,7 +354,7 @@ static void sff8079_show_ascii(const __u8 *id, unsigned int first_reg,
{
unsigned int reg, val;
- printf("\t%-20s: ", name);
+ printf("\t%-41s : ", name);
for (reg = first_reg; reg <= last_reg; reg++) {
val = id[reg];
putchar(((val >= 32) && (val <= 126)) ? val : '_');
diff --git a/test-cmdline.c b/test-cmdline.c
index 85b4ce0..f1d4555 100644
--- a/test-cmdline.c
+++ b/test-cmdline.c
@@ -213,6 +213,8 @@ static struct test_case {
{ 0, "-m devname" },
{ 1, "--dump-module-eeprom" },
{ 0, "--dump-module-eeprom devname" },
+ { 1, "--module-info" },
+ { 0, "--module-info devname" },
{ 0, "-m devname raw on" },
{ 0, "-m devname raw off" },
{ 0, "-m devname hex on" },
--
1.7.0.4
^ permalink raw reply related
* Re: [PATCHv4] virtio-spec: virtio network device RFS support
From: Rusty Russell @ 2012-12-02 22:46 UTC (permalink / raw)
To: Michael S. Tsirkin, Jason Wang; +Cc: netdev, kvm, virtualization
In-Reply-To: <20121122144645.GA28284@redhat.com>
"Michael S. Tsirkin" <mst@redhat.com> writes:
> Add RFS support to virtio network device.
> Add a new feature flag VIRTIO_NET_F_RFS for this feature, a new
> configuration field max_virtqueue_pairs to detect supported number of
> virtqueues as well as a new command VIRTIO_NET_CTRL_RFS to program
> packet steering for unidirectional protocols.
Hi Michael,
Sorry for the delay, I took last week off.
> - rename multiqueue -> rfs this is what we support
> - Be more explicit about what driver should do.
> - Simplify layout making VQs functionality depend on feature.
> - Remove unused commands, only leave in programming # of queues
Thanks: this looks really nice now. Comments are about the text, not
the ideas.
> + 2N+1: transmitqN.
> + 2N+
> +\change_unchanged
> +2:controlq
> \begin_inset Foot
> status open
Hmm, controlq after xmit queues... a nice improvement.
> +VIRTIO_NET_F_RFS(2) Device supports Receive Flow Steering.
I think readers would prefer numerical order to historical order here,
so perhaps move this up in the list.
> -layout Two configuration fields are currently defined.
> +layout
> +\change_deleted 1986246365 1352743300
> +Two
> +\change_inserted 1986246365 1352743301
> +Four
> +\change_unchanged
> + configuration fields are currently defined.
two to four? I only see three? And you didn't update the structure to
match...
> + Following this, driver should not transmit new packets on virtqueues other
> + than transmitq0 and device will not steer new packets on virtqueues other
> + than receiveq0.
"Following this" is vague. After the buffer is consumed by the device.
Should not is kind of meaningless. Let's make it clear: the device will
not steer new packets to RxqN, nor read from TxqN.
You should probably put in a note about the RFS control in the Device
Initialization section, too, ie. if you have negotiated and want to use
more queues, you must initialize them then wait for the ack of the
CTRL_RFS cmd.
Note: the following hunks didn't apply, but I'm not sure why they're in
this anyway...
> @@ -6152,13 +6385,7 @@ Virtqueues 0:receiveq(port0).
> status open
>
> \begin_layout Plain Layout
> -Ports
> -\change_inserted 1986246365 1347188327
> -1
> -\change_deleted 1986246365 1347188327
> -2
> -\change_unchanged
> - onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
> +Ports 12 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
> \end_layout
>
> \end_inset
> @@ -6185,13 +6412,8 @@ VIRTIO_CONSOLE_F_SIZE
>
> \begin_layout Description
> VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple ports; configurati
> -on fields nr_ports and max_nr_ports are valid
> -\change_inserted 1986246365 1347188404
> -; if this bit is negotiated,
> -\change_deleted 1986246365 1347188406
> - and
> -\change_unchanged
> - control virtqueues will be used.
> +on fields nr_ports and max_nr_ports are valid; if this bit is negotiated,
> + and control virtqueues will be used.
> \end_layout
>
> \end_deeper
> @@ -6260,8 +6482,7 @@ If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the driver can
> spawn multiple ports, not all of which may be attached to a console.
> Some could be generic ports.
> In this case, the control virtqueues are enabled and according to the max_nr_po
> -rts configuration-space value, an appropriate number of virtqueues are
> - created.
> +rts configuration-space value, an appropriate number of virtqueues are created.
> A control message indicating the driver is ready is sent to the host.
> The host can then send control messages for adding new ports to the device.
> After creating and initializing each port, a VIRTIO_CONSOLE_PORT_READY
> @@ -6699,14 +6920,9 @@ The driver constructs an array of addresses of memory pages it has previously
> \end_layout
>
> \begin_layout Enumerate
> -If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is
> -\change_inserted 1986246365 1347188540
> -negotiated
> -\change_deleted 1986246365 1347188542
> -set
> -\change_unchanged
> -, the guest may not use these requested pages until that descriptor in the
> - deflateq has been used by the device.
> +If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is negotiatedset, the guest
> + may not use these requested pages until that descriptor in the deflateq
> + has been used by the device.
> \end_layout
>
> \begin_layout Enumerate
Cheers,
Rusty.
^ permalink raw reply
* Re: [GIT] Networking
From: Linus Torvalds @ 2012-12-03 0:13 UTC (permalink / raw)
To: David Miller
Cc: Andrew Morton, Network Development, Linux Kernel Mailing List
In-Reply-To: <20121128.214732.1634269294133625782.davem@davemloft.net>
David, Willy pointed me to the recent splice crash fix
(do_tcp_sendpages and non-0-order pages). It's apparently easily
user-triggerable.. Should I take the patch directly, or do you have a
tree to pull. Don't want to make a release with a known oopser..
Linus
^ permalink raw reply
* Re: [GIT] Networking
From: David Miller @ 2012-12-03 0:32 UTC (permalink / raw)
To: torvalds; +Cc: akpm, netdev, linux-kernel
In-Reply-To: <CA+55aFy1sa=D-DrWqNuvjrLW8J0Tw1+GbZhXv6VHTTTxXG_DMA@mail.gmail.com>
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 2 Dec 2012 16:13:30 -0800
> David, Willy pointed me to the recent splice crash fix
> (do_tcp_sendpages and non-0-order pages). It's apparently easily
> user-triggerable.. Should I take the patch directly, or do you have a
> tree to pull. Don't want to make a release with a known oopser..
I have a tree to pull. Coming in a few minutes.
^ permalink raw reply
* [GIT] Networking
From: David Miller @ 2012-12-03 0:36 UTC (permalink / raw)
To: torvalds; +Cc: akpm, netdev, linux-kernel
1) 8139cp leaks memory in error paths, from Francois Romieu.
2) do_tcp_sendpages() cannot handle order > 0 pages, but they can
certainly arrive there now, fix from Eric Dumazet.
3) Race condition and sysfs fixes in bonding from Nikolay Aleksandrov.
4) Remain-on-Channel fix in mac80211 from Felix Liao.
5) CCK rate calculation fix in iwlwifi, from Emmanuel Grumbach.
Please pull, thanks a lot!
The following changes since commit e9296e89b85604862bd9ec2d54dc43edad775c0d:
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2012-11-28 21:54:07 -0800)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net master
for you to fetch changes up to 892a925e42adb8192a3c832ad29cbc780fc466f6:
8139cp: fix coherent mapping leak in error path. (2012-12-01 20:39:17 -0500)
----------------------------------------------------------------
Emmanuel Grumbach (1):
iwlwifi: fix the basic CCK rates calculation
Eric Dumazet (1):
tcp: fix crashes in do_tcp_sendpages()
Johannes Berg (1):
mac80211: fix remain-on-channel (non-)cancelling
John W. Linville (2):
Merge branch 'for-john' of git://git.kernel.org/.../iwlwifi/iwlwifi-fixes
Merge branch 'master' of git://git.kernel.org/.../linville/wireless into for-davem
françois romieu (1):
8139cp: fix coherent mapping leak in error path.
nikolay@redhat.com (3):
bonding: fix miimon and arp_interval delayed work race conditions
bonding: make arp_ip_target parameter checks consistent with sysfs
bonding: fix race condition in bonding_store_slaves_active
drivers/net/bonding/bond_main.c | 93 +++++++++++++++++++++----------------------------------------------
drivers/net/bonding/bond_sysfs.c | 36 +++++++++-----------------
drivers/net/ethernet/realtek/8139cp.c | 11 +++++---
drivers/net/wireless/iwlwifi/dvm/rxon.c | 12 ++++-----
net/ipv4/tcp.c | 15 +++++------
net/mac80211/offchannel.c | 2 --
6 files changed, 61 insertions(+), 108 deletions(-)
^ permalink raw reply
* Re: [PATCH net-next v2 1/3] MAINTAINERS: Add Mellanox ethernet driver - mlx4_en
From: David Miller @ 2012-12-03 1:23 UTC (permalink / raw)
To: amirv; +Cc: bhutchings, ogerlitz, oren, netdev, yevgenyp
In-Reply-To: <1354456163-10497-2-git-send-email-amirv@mellanox.com>
From: Amir Vadai <amirv@mellanox.com>
Date: Sun, 2 Dec 2012 15:49:21 +0200
> Set mlx4_en maintainer to Amir Vadai instead of Yevgeny Petrilin.
>
> Signed-off-by: Amir Vadai <amirv@mellanox.com>
> Cc: Yevgeny Petrilin <yevgenyp@mellanox.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next v2 2/3] net/mlx4_en: Fix TX moderation info loss after set_ringparam is called
From: David Miller @ 2012-12-03 1:23 UTC (permalink / raw)
To: amirv; +Cc: bhutchings, ogerlitz, oren, netdev
In-Reply-To: <1354456163-10497-3-git-send-email-amirv@mellanox.com>
From: Amir Vadai <amirv@mellanox.com>
Date: Sun, 2 Dec 2012 15:49:22 +0200
> We need to re-set tx moderation information after calling set_ringparam
> else default tx moderation will be used.
> Also avoid related code duplication, by putting it in a utility function.
>
> Signed-off-by: Amir Vadai <amirv@mellanox.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next v2 3/3] net/mlx4_en: Set number of rx/tx channels using ethtool
From: David Miller @ 2012-12-03 1:23 UTC (permalink / raw)
To: amirv; +Cc: bhutchings, ogerlitz, oren, netdev
In-Reply-To: <1354456163-10497-4-git-send-email-amirv@mellanox.com>
From: Amir Vadai <amirv@mellanox.com>
Date: Sun, 2 Dec 2012 15:49:23 +0200
> Add support to changing number of rx/tx channels using
> ethtool ('ethtool -[lL]'). Where the number of tx channels specified in ethtool
> is the number of rings per user priority - not total number of tx rings.
>
> Signed-off-by: Amir Vadai <amirv@mellanox.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next 00/13] bnx2x: net-next patch series
From: David Miller @ 2012-12-03 1:23 UTC (permalink / raw)
To: yuvalmin; +Cc: netdev, eilong, ariele
In-Reply-To: <1354457157-4730-1-git-send-email-yuvalmin@broadcom.com>
From: "Yuval Mintz" <yuvalmin@broadcom.com>
Date: Sun, 2 Dec 2012 16:05:44 +0200
> Hi Dave,
>
> This patch series contains several small changes to the bnx2x driver,
> including dcb changes, graceful error handling and benign error masking,
> and setting the driver's configuration according to management/nvram
> held values.
>
> Please consider applying these patches to 'net-next'.
Series applied.
^ permalink raw reply
* Re: [PATCH net-next] net: fix sparse endianness warnings on sock_common
From: David Miller @ 2012-12-03 1:23 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev, fengguang.wu, ling.ma.program
In-Reply-To: <1354469590.20109.585.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 02 Dec 2012 09:33:10 -0800
> From: Eric Dumazet <edumazet@google.com>
>
> # make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/inet_hashtables.o
> ...
> net/ipv4/inet_hashtables.c:242:7: warning: restricted __portpair degrades to integer
> net/ipv4/inet_hashtables.c:242:7: warning: restricted __addrpair degrades to integer
> ...
>
> Move __portpair/__addrpair from include/net/inet_hashtables.h
> to include/net/sock.h where we need them in struct sock_common
>
> Reported-by: Fengguang Wu <fengguang.wu@intel.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next] tcp: don't abort splice() after small transfers
From: David Miller @ 2012-12-03 1:24 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev, w
In-Reply-To: <1354484967.20109.1167.camel@edumazet-glaptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 02 Dec 2012 13:49:27 -0800
> From: Willy Tarreau <w@1wt.eu>
>
> TCP coalescing added a regression in splice(socket->pipe) performance,
> for some workloads because of the way tcp_read_sock() is implemented.
>
> The reason for this is the break when (offset + 1 != skb->len).
>
> As we released the socket lock, this condition is possible if TCP stack
> added a fragment to the skb, which can happen with TCP coalescing.
>
> So let's go back to the beginning of the loop when this happens,
> to give a chance to splice more frags per system call.
>
> Doing so fixes the issue and makes GRO 10% faster than LRO
> on CPU-bound splice() workloads instead of the opposite.
>
> Signed-off-by: Willy Tarreau <w@1wt.eu>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied.
^ permalink raw reply
* Re: [PATCH 1/6] bna: remove useless calls to memset().
From: David Miller @ 2012-12-03 1:33 UTC (permalink / raw)
To: tipecaml; +Cc: linux-kernel, kernel-janitors, rmody, netdev
In-Reply-To: <1354416022-20189-2-git-send-email-tipecaml@gmail.com>
From: Cyril Roelandt <tipecaml@gmail.com>
Date: Sun, 2 Dec 2012 03:40:17 +0100
> These calls are followed by calls to memcpy() on the same memory area, so they
> can safely be removed.
>
> Signed-off-by: Cyril Roelandt <tipecaml@gmail.com>
Applied, thanks.
^ permalink raw reply
* Re: [net-next rfc v7 1/3] virtio-net: separate fields of sending/receiving queue from virtnet_info
From: Rusty Russell @ 2012-12-03 1:55 UTC (permalink / raw)
To: Jason Wang, mst, krkumar2, virtualization, netdev, linux-kernel
Cc: bhutchings, jwhan, shiyer, kvm
In-Reply-To: <1354011360-39479-2-git-send-email-jasowang@redhat.com>
Jason Wang <jasowang@redhat.com> writes:
> To support multiqueue transmitq/receiveq, the first step is to separate queue
> related structure from virtnet_info. This patch introduce send_queue and
> receive_queue structure and use the pointer to them as the parameter in
> functions handling sending/receiving.
OK, seems like a straightforward xform: a few nit-picks:
> +/* Internal representation of a receive virtqueue */
> +struct receive_queue {
> + /* Virtqueue associated with this receive_queue */
> + struct virtqueue *vq;
> +
> + struct napi_struct napi;
> +
> + /* Number of input buffers, and max we've ever had. */
> + unsigned int num, max;
Weird whitespace here.
> +
> + /* Work struct for refilling if we run low on memory. */
> + struct delayed_work refill;
I can't really see the justification for a refill per queue. Just have
one work iterate all the queues if it happens, unless it happens often
(in which case, we need to look harder at this anyway).
> struct virtnet_info {
> struct virtio_device *vdev;
> - struct virtqueue *rvq, *svq, *cvq;
> + struct virtqueue *cvq;
> struct net_device *dev;
> struct napi_struct napi;
You leave napi here, and take it away in the next patch. I think it's
supposed to go away now.
Cheers,
Rusty.
^ permalink raw reply
* Re: [net-next rfc v7 2/3] virtio_net: multiqueue support
From: Rusty Russell @ 2012-12-03 2:04 UTC (permalink / raw)
To: Jason Wang, mst, krkumar2, virtualization, netdev, linux-kernel
Cc: bhutchings, jwhan, shiyer, kvm
In-Reply-To: <1354011360-39479-3-git-send-email-jasowang@redhat.com>
Jason Wang <jasowang@redhat.com> writes:
> +static const struct ethtool_ops virtnet_ethtool_ops;
> +
> +/*
> + * Converting between virtqueue no. and kernel tx/rx queue no.
> + * 0:rx0 1:tx0 2:cvq 3:rx1 4:tx1 ... 2N+1:rxN 2N+2:txN
> + */
> +static int vq2txq(struct virtqueue *vq)
> +{
> + int index = virtqueue_get_queue_index(vq);
> + return index == 1 ? 0 : (index - 2) / 2;
> +}
> +
> +static int txq2vq(int txq)
> +{
> + return txq ? 2 * txq + 2 : 1;
> +}
> +
> +static int vq2rxq(struct virtqueue *vq)
> +{
> + int index = virtqueue_get_queue_index(vq);
> + return index ? (index - 1) / 2 : 0;
> +}
> +
> +static int rxq2vq(int rxq)
> +{
> + return rxq ? 2 * rxq + 1 : 0;
> +}
> +
I thought MST changed the proposed spec to make the control queue always
the last one, so this logic becomes trivial.
> +static int virtnet_set_queues(struct virtnet_info *vi)
> +{
> + struct scatterlist sg;
> + struct virtio_net_ctrl_rfs s;
> + struct net_device *dev = vi->dev;
> +
> + s.virtqueue_pairs = vi->curr_queue_pairs;
> + sg_init_one(&sg, &s, sizeof(s));
> +
> + if (!vi->has_cvq)
> + return -EINVAL;
> +
> + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
> + VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, &sg, 1, 0)){
> + dev_warn(&dev->dev, "Fail to set the number of queue pairs to"
> + " %d\n", vi->curr_queue_pairs);
> + return -EINVAL;
> + }
Where do we check the VIRTIO_NET_F_RFS bit?
> static int virtnet_probe(struct virtio_device *vdev)
> {
> - int err;
> + int i, err;
> struct net_device *dev;
> struct virtnet_info *vi;
> + u16 curr_queue_pairs;
> +
> + /* Find if host supports multiqueue virtio_net device */
> + err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
> + offsetof(struct virtio_net_config,
> + max_virtqueue_pairs), &curr_queue_pairs);
> +
> + /* We need at least 2 queue's */
> + if (err)
> + curr_queue_pairs = 1;
Huh? Just call this queue_pairs. It's not curr_ at all...
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
> + vi->has_cvq = true;
> +
> + /* Use single tx/rx queue pair as default */
> + vi->curr_queue_pairs = 1;
> + vi->max_queue_pairs = curr_queue_pairs;
See...
Cheers,
Rusty.
^ permalink raw reply
* [PATCH net-next] tuntap: attach queue 0 before registering netdevice
From: Jason Wang @ 2012-12-03 3:19 UTC (permalink / raw)
To: davem, netdev, linux-kernel, jslaby; +Cc: Jason Wang
Currently we attach queue 0 after registering the netdevice, so
netif_set_real_num_{tx|rx}_queues() is called after registration. Since
tun/tap allows a maximum of 1024 queues, this may inject a huge number of
uevents into userspace, because we create 2048 kobjects and then remove
2046. Solve this problem by attaching queue 0 and setting the real number of
queues before registering the netdevice.
Reported-by: Jiri Slaby <jslaby@suse.cz>
Tested-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 11 +++++------
1 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b44d7b7..cc3f878 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -492,9 +492,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
tun_set_real_num_queues(tun);
- if (tun->numqueues == 1)
- netif_carrier_on(tun->dev);
-
/* device is allowed to go away first, so no need to hold extra
* refcnt.
*/
@@ -1611,6 +1608,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
TUN_USER_FEATURES;
dev->features = dev->hw_features;
+ err = tun_attach(tun, file);
+ if (err < 0)
+ goto err_free_dev;
+
err = register_netdevice(tun->dev);
if (err < 0)
goto err_free_dev;
@@ -1620,9 +1621,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
device_create_file(&tun->dev->dev, &dev_attr_group))
pr_err("Failed to create tun sysfs files\n");
- err = tun_attach(tun, file);
- if (err < 0)
- goto err_free_dev;
+ netif_carrier_on(tun->dev);
}
tun_debug(KERN_INFO, tun, "tun_set_iff\n");
--
1.7.1
^ permalink raw reply related
* [PATCH 3/4 net-next] tg3: PTP - Add the hardware timestamp ioctl
From: Michael Chan @ 2012-12-03 3:42 UTC (permalink / raw)
To: davem; +Cc: netdev, nsujir
In-Reply-To: <1354506171-1646-2-git-send-email-mchan@broadcom.com>
From: Matt Carlson <mcarlson@broadcom.com>
This patch implements the SIOCSHWTSTAMP ioctl as described in
Documentation/networking/timestamping.txt
Signed-off-by: Nithin Nayak Sujir <nsujir@broadcom.com>
Signed-off-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/ethernet/broadcom/tg3.c | 99 +++++++++++++++++++++++++++++++++++
1 files changed, 99 insertions(+), 0 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index a54d194..f6e956c 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12755,6 +12755,102 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest,
}
+static int tg3_hwtstamp_ioctl(struct net_device *dev,
+ struct ifreq *ifr, int cmd)
+{
+ struct tg3 *tp = netdev_priv(dev);
+ struct hwtstamp_config stmpconf;
+
+ if (!tg3_flag(tp, PTP_CAPABLE))
+ return -EINVAL;
+
+ if (copy_from_user(&stmpconf, ifr->ifr_data, sizeof(stmpconf)))
+ return -EFAULT;
+
+ if (stmpconf.flags)
+ return -EINVAL;
+
+ switch (stmpconf.tx_type) {
+ case HWTSTAMP_TX_ON:
+ tg3_flag_set(tp, TX_TSTAMP_EN);
+ break;
+ case HWTSTAMP_TX_OFF:
+ tg3_flag_clear(tp, TX_TSTAMP_EN);
+ break;
+ default:
+ return -ERANGE;
+ }
+
+ switch (stmpconf.rx_filter) {
+ case HWTSTAMP_FILTER_NONE:
+ tp->rxptpctl = 0;
+ break;
+ case HWTSTAMP_FILTER_ALL:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V1_EN |
+ TG3_RX_PTP_CTL_ALL_V1_EVENTS |
+ TG3_RX_PTP_CTL_RX_PTP_V2_EN |
+ TG3_RX_PTP_CTL_ALL_V2_EVENTS;
+ break;
+ case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V1_EN |
+ TG3_RX_PTP_CTL_ALL_V1_EVENTS;
+ break;
+ case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V1_EN |
+ TG3_RX_PTP_CTL_SYNC_EVNT;
+ break;
+ case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V1_EN |
+ TG3_RX_PTP_CTL_DELAY_REQ;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_EVENT:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_EN |
+ TG3_RX_PTP_CTL_ALL_V2_EVENTS;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_L2_EN |
+ TG3_RX_PTP_CTL_ALL_V2_EVENTS;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_L4_EN |
+ TG3_RX_PTP_CTL_ALL_V2_EVENTS;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_SYNC:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_EN |
+ TG3_RX_PTP_CTL_SYNC_EVNT;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_L2_EN |
+ TG3_RX_PTP_CTL_SYNC_EVNT;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_L4_EN |
+ TG3_RX_PTP_CTL_SYNC_EVNT;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_EN |
+ TG3_RX_PTP_CTL_DELAY_REQ;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_L2_EN |
+ TG3_RX_PTP_CTL_DELAY_REQ;
+ break;
+ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+ tp->rxptpctl = TG3_RX_PTP_CTL_RX_PTP_V2_L4_EN |
+ TG3_RX_PTP_CTL_DELAY_REQ;
+ break;
+ default:
+ return -ERANGE;
+ }
+
+ if (netif_running(dev) && tp->rxptpctl)
+ tw32(TG3_RX_PTP_CTL,
+ tp->rxptpctl | TG3_RX_PTP_CTL_HWTS_INTERLOCK);
+
+ return copy_to_user(ifr->ifr_data, &stmpconf, sizeof(stmpconf)) ?
+ -EFAULT : 0;
+}
+
static int tg3_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
struct mii_ioctl_data *data = if_mii(ifr);
@@ -12805,6 +12901,9 @@ static int tg3_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return err;
+ case SIOCSHWTSTAMP:
+ return tg3_hwtstamp_ioctl(dev, ifr, cmd);
+
default:
/* do nothing */
break;
--
1.7.1
^ permalink raw reply related
* [PATCH 4/4 net-next] tg3: PTP - Enable the timestamping feature in hardware and fill skb tx/rx timestamps
From: Michael Chan @ 2012-12-03 3:42 UTC (permalink / raw)
To: davem; +Cc: netdev, nsujir
In-Reply-To: <1354506171-1646-3-git-send-email-mchan@broadcom.com>
From: Matt Carlson <mcarlson@broadcom.com>
This patch implements the hardware timestamping as described in
Documentation/networking/timestamping.txt
Update version to 3.128.
Signed-off-by: Nithin Nayak Sujir <nsujir@broadcom.com>
Signed-off-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/ethernet/broadcom/tg3.c | 57 +++++++++++++++++++++++++++++++---
1 files changed, 52 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index f6e956c..b2ad1c4 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -93,10 +93,10 @@ static inline void _tg3_flag_clear(enum TG3_FLAGS flag, unsigned long *bits)
#define DRV_MODULE_NAME "tg3"
#define TG3_MAJ_NUM 3
-#define TG3_MIN_NUM 127
+#define TG3_MIN_NUM 128
#define DRV_MODULE_VERSION \
__stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM)
-#define DRV_MODULE_RELDATE "November 14, 2012"
+#define DRV_MODULE_RELDATE "December 02, 2012"
#define RESET_KIND_SHUTDOWN 0
#define RESET_KIND_INIT 1
@@ -5658,6 +5658,14 @@ static const struct ptp_clock_info tg3_ptp_caps = {
.enable = tg3_ptp_enable,
};
+static void tg3_hwclock_to_timestamp(struct tg3 *tp, u64 hwclock,
+ struct skb_shared_hwtstamps *timestamp)
+{
+ memset(timestamp, 0, sizeof(struct skb_shared_hwtstamps));
+ timestamp->hwtstamp = ns_to_ktime((hwclock & TG3_TSTAMP_MASK) +
+ tp->ptp_adjust);
+}
+
static void tg3_ptp_init(struct tg3 *tp)
{
if (!tg3_flag(tp, PTP_CAPABLE))
@@ -5871,6 +5879,16 @@ static void tg3_tx(struct tg3_napi *tnapi)
return;
}
+ if (tnapi->tx_ring[sw_idx].len_flags & TXD_FLAG_HWTSTAMP) {
+ struct skb_shared_hwtstamps timestamp;
+ u64 hwclock = tr32(TG3_TX_TSTAMP_LSB);
+ hwclock |= (u64)tr32(TG3_TX_TSTAMP_MSB) << 32;
+
+ tg3_hwclock_to_timestamp(tp, hwclock, &timestamp);
+
+ skb_tstamp_tx(skb, &timestamp);
+ }
+
pci_unmap_single(tp->pdev,
dma_unmap_addr(ri, mapping),
skb_headlen(skb),
@@ -6138,6 +6156,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
dma_addr_t dma_addr;
u32 opaque_key, desc_idx, *post_ptr;
u8 *data;
+ u64 tstamp = 0;
desc_idx = desc->opaque & RXD_OPAQUE_INDEX_MASK;
opaque_key = desc->opaque & RXD_OPAQUE_RING_MASK;
@@ -6172,6 +6191,14 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
len = ((desc->idx_len & RXD_LEN_MASK) >> RXD_LEN_SHIFT) -
ETH_FCS_LEN;
+ if ((desc->type_flags & RXD_FLAG_PTPSTAT_MASK) ==
+ RXD_FLAG_PTPSTAT_PTPV1 ||
+ (desc->type_flags & RXD_FLAG_PTPSTAT_MASK) ==
+ RXD_FLAG_PTPSTAT_PTPV2) {
+ tstamp = tr32(TG3_RX_TSTAMP_LSB);
+ tstamp |= (u64)tr32(TG3_RX_TSTAMP_MSB) << 32;
+ }
+
if (len > TG3_RX_COPY_THRESH(tp)) {
int skb_size;
unsigned int frag_size;
@@ -6215,6 +6242,10 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
}
skb_put(skb, len);
+ if (tstamp)
+ tg3_hwclock_to_timestamp(tp, tstamp,
+ skb_hwtstamps(skb));
+
if ((tp->dev->features & NETIF_F_RXCSUM) &&
(desc->type_flags & RXD_FLAG_TCPUDP_CSUM) &&
(((desc->ip_tcp_csum & RXD_TCPCSUM_MASK)
@@ -7271,6 +7302,12 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
vlan = vlan_tx_tag_get(skb);
}
+ if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) &&
+ tg3_flag(tp, TX_TSTAMP_EN)) {
+ skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+ base_flags |= TXD_FLAG_HWTSTAMP;
+ }
+
len = skb_headlen(skb);
mapping = pci_map_single(tp->pdev, skb->data, len, PCI_DMA_TODEVICE);
@@ -9139,9 +9176,15 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy)
*/
tp->grc_mode |= GRC_MODE_NO_TX_PHDR_CSUM;
- tw32(GRC_MODE,
- tp->grc_mode |
- (GRC_MODE_IRQ_ON_MAC_ATTN | GRC_MODE_HOST_STACKUP));
+ val = GRC_MODE_IRQ_ON_MAC_ATTN | GRC_MODE_HOST_STACKUP;
+ if (tp->rxptpctl)
+ tw32(TG3_RX_PTP_CTL,
+ tp->rxptpctl | TG3_RX_PTP_CTL_HWTS_INTERLOCK);
+
+ if (tg3_flag(tp, PTP_CAPABLE))
+ val |= GRC_MODE_TIME_SYNC_ENABLE;
+
+ tw32(GRC_MODE, tp->grc_mode | val);
/* Setup the timer prescalar register. Clock is always 66Mhz. */
val = tr32(GRC_MISC_CFG);
@@ -16565,6 +16608,10 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
pci_set_drvdata(pdev, dev);
+ if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719 ||
+ GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5720)
+ tg3_flag_set(tp, PTP_CAPABLE);
+
if (tg3_flag(tp, 5717_PLUS)) {
/* Resume a low-power mode */
tg3_frob_aux_power(tp, false);
--
1.7.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox