Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 7/7] mlx4_en: Updated driver version and date
From: Yevgeny Petrilin @ 2009-10-01 14:34 UTC (permalink / raw)
  To: davem; +Cc: netdev


Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 drivers/net/mlx4/mlx4_en.h |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h
index 8655624..89ca376 100644
--- a/drivers/net/mlx4/mlx4_en.h
+++ b/drivers/net/mlx4/mlx4_en.h
@@ -50,8 +50,8 @@
 #include "en_port.h"
 
 #define DRV_NAME	"mlx4_en"
-#define DRV_VERSION	"1.4.1.1"
-#define DRV_RELDATE	"June 2009"
+#define DRV_VERSION	"1.4.2.1"
+#define DRV_RELDATE	"Oct 2009"
 
 #define MLX4_EN_MSG_LEVEL	(NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
 
-- 
1.6.1.3



^ permalink raw reply related

* [PATCH 6/7] mlx4_en: performing CLOSE_PORT at the end of tear-down process
From: Yevgeny Petrilin @ 2009-10-01 14:34 UTC (permalink / raw)
  To: davem; +Cc: netdev

As required by the ConnectX PRM.
Not doing so might cause races in the HW during the tear-down process.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 drivers/net/mlx4/en_netdev.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c
index 13fc6f0..b1e80d8 100644
--- a/drivers/net/mlx4/en_netdev.c
+++ b/drivers/net/mlx4/en_netdev.c
@@ -711,9 +711,8 @@ void mlx4_en_stop_port(struct net_device *dev)
 	netif_tx_stop_all_queues(dev);
 	netif_tx_unlock_bh(dev);
 
-	/* close port*/
+	/* Set port as not active */
 	priv->port_up = false;
-	mlx4_CLOSE_PORT(mdev->dev, priv->port);
 
 	/* Unregister Mac address for the port */
 	mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
@@ -738,6 +737,9 @@ void mlx4_en_stop_port(struct net_device *dev)
 			msleep(1);
 		mlx4_en_deactivate_cq(priv, &priv->rx_cq[i]);
 	}
+
+	/* close port*/
+	mlx4_CLOSE_PORT(mdev->dev, priv->port);
 }
 
 static void mlx4_en_restart(struct work_struct *work)
-- 
1.6.1.3



^ permalink raw reply related

* [PATCH 5/7] mlx4_en: Setting dev->perm_addr field
From: Yevgeny Petrilin @ 2009-10-01 14:34 UTC (permalink / raw)
  To: davem; +Cc: netdev

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 drivers/net/mlx4/en_netdev.c |    7 ++++---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c
index 922ae1f..13fc6f0 100644
--- a/drivers/net/mlx4/en_netdev.c
+++ b/drivers/net/mlx4/en_netdev.c
@@ -1030,9 +1030,10 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 
 	/* Set defualt MAC */
 	dev->addr_len = ETH_ALEN;
-	for (i = 0; i < ETH_ALEN; i++)
-		dev->dev_addr[ETH_ALEN - 1 - i] =
-		(u8) (priv->mac >> (8 * i));
+	for (i = 0; i < ETH_ALEN; i++) {
+		dev->dev_addr[ETH_ALEN - 1 - i] = (u8) (priv->mac >> (8 * i));
+		dev->perm_addr[ETH_ALEN - 1 - i] = (u8) (priv->mac >> (8 * i));
+	}
 
 	/*
 	 * Set driver features
-- 
1.6.1.3



^ permalink raw reply related

* [PATCH 4/7] mlx4_en: Added self diagnostics test implementation
From: Yevgeny Petrilin @ 2009-10-01 14:33 UTC (permalink / raw)
  To: davem; +Cc: netdev

The self-test consists of 5 tests:
1. Interrupt test: Executing commands and receiving command completions
   on all our interrupt vectors.
2. Link test: Verifying we are connected to a valid link partner.
3. Speed test: Checking that we negotiated the link speed correctly.
4. Registers test: Activating the HW health check command.
5. Loopback test: Sending a packet on the loopback interface and catching
   it on the RX side.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 drivers/net/mlx4/Makefile      |    2 +-
 drivers/net/mlx4/en_ethtool.c  |   79 ++++++++++++------
 drivers/net/mlx4/en_netdev.c   |    2 +-
 drivers/net/mlx4/en_port.c     |   32 +++++++
 drivers/net/mlx4/en_port.h     |   14 +++
 drivers/net/mlx4/en_rx.c       |    7 ++
 drivers/net/mlx4/en_selftest.c |  182 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/en_tx.c       |   16 ++++
 drivers/net/mlx4/mlx4_en.h     |   21 +++++-
 9 files changed, 327 insertions(+), 28 deletions(-)
 create mode 100644 drivers/net/mlx4/en_selftest.c

diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 1fd068e..d1aa45a 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -6,4 +6,4 @@ mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
 obj-$(CONFIG_MLX4_EN)               += mlx4_en.o
 
 mlx4_en-y := 	en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \
-		en_resources.o en_netdev.o
+		en_resources.o en_netdev.o en_selftest.o
diff --git a/drivers/net/mlx4/en_ethtool.c b/drivers/net/mlx4/en_ethtool.c
index 86467b4..745a204 100644
--- a/drivers/net/mlx4/en_ethtool.c
+++ b/drivers/net/mlx4/en_ethtool.c
@@ -125,6 +125,14 @@ static const char main_strings[][ETH_GSTRING_LEN] = {
 #define NUM_MAIN_STATS	21
 #define NUM_ALL_STATS	(NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + NUM_PERF_STATS)
 
+static const char mlx4_en_test_names[][ETH_GSTRING_LEN] = {
+	"Interupt Test",
+	"Link Test",
+	"Speed Test",
+	"Register Test",
+	"Loopback Test",
+};
+
 static u32 mlx4_en_get_msglevel(struct net_device *dev)
 {
 	return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable;
@@ -148,10 +156,15 @@ static int mlx4_en_get_sset_count(struct net_device *dev, int sset)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 
-	if (sset != ETH_SS_STATS)
+	switch (sset) {
+	case ETH_SS_STATS:
+		return NUM_ALL_STATS +
+			(priv->tx_ring_num + priv->rx_ring_num) * 2;
+	case ETH_SS_TEST:
+		return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.loopback_support) * 2;
+	default:
 		return -EOPNOTSUPP;
-
-	return NUM_ALL_STATS + (priv->tx_ring_num + priv->rx_ring_num) * 2;
+	}
 }
 
 static void mlx4_en_get_ethtool_stats(struct net_device *dev,
@@ -183,6 +196,12 @@ static void mlx4_en_get_ethtool_stats(struct net_device *dev,
 
 }
 
+static void mlx4_en_self_test(struct net_device *dev,
+			      struct ethtool_test *etest, u64 *buf)
+{
+	mlx4_en_ex_selftest(dev, &etest->flags, buf);
+}
+
 static void mlx4_en_get_strings(struct net_device *dev,
 				uint32_t stringset, uint8_t *data)
 {
@@ -190,30 +209,39 @@ static void mlx4_en_get_strings(struct net_device *dev,
 	int index = 0;
 	int i;
 
-	if (stringset != ETH_SS_STATS)
-		return;
-
-	/* Add main counters */
-	for (i = 0; i < NUM_MAIN_STATS; i++)
-		strcpy(data + (index++) * ETH_GSTRING_LEN, main_strings[i]);
-	for (i = 0; i < NUM_PORT_STATS; i++)
-		strcpy(data + (index++) * ETH_GSTRING_LEN,
+	switch (stringset) {
+	case ETH_SS_TEST:
+		for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++)
+			strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		if (priv->mdev->dev->caps.loopback_support)
+			for (; i < MLX4_EN_NUM_SELF_TEST; i++)
+				strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		break;
+
+	case ETH_SS_STATS:
+		/* Add main counters */
+		for (i = 0; i < NUM_MAIN_STATS; i++)
+			strcpy(data + (index++) * ETH_GSTRING_LEN, main_strings[i]);
+		for (i = 0; i < NUM_PORT_STATS; i++)
+			strcpy(data + (index++) * ETH_GSTRING_LEN,
 			main_strings[i + NUM_MAIN_STATS]);
-	for (i = 0; i < priv->tx_ring_num; i++) {
-		sprintf(data + (index++) * ETH_GSTRING_LEN,
-			"tx%d_packets", i);
-		sprintf(data + (index++) * ETH_GSTRING_LEN,
-			"tx%d_bytes", i);
-	}
-	for (i = 0; i < priv->rx_ring_num; i++) {
-		sprintf(data + (index++) * ETH_GSTRING_LEN,
-			"rx%d_packets", i);
-		sprintf(data + (index++) * ETH_GSTRING_LEN,
-			"rx%d_bytes", i);
-	}
-	for (i = 0; i < NUM_PKT_STATS; i++)
-		strcpy(data + (index++) * ETH_GSTRING_LEN,
+		for (i = 0; i < priv->tx_ring_num; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_bytes", i);
+		}
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_bytes", i);
+		}
+		for (i = 0; i < NUM_PKT_STATS; i++)
+			strcpy(data + (index++) * ETH_GSTRING_LEN,
 			main_strings[i + NUM_MAIN_STATS + NUM_PORT_STATS]);
+		break;
+	}
 }
 
 static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
@@ -407,6 +435,7 @@ const struct ethtool_ops mlx4_en_ethtool_ops = {
 	.get_strings = mlx4_en_get_strings,
 	.get_sset_count = mlx4_en_get_sset_count,
 	.get_ethtool_stats = mlx4_en_get_ethtool_stats,
+	.self_test = mlx4_en_self_test,
 	.get_wol = mlx4_en_get_wol,
 	.get_msglevel = mlx4_en_get_msglevel,
 	.set_msglevel = mlx4_en_set_msglevel,
diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c
index c48b0f4..922ae1f 100644
--- a/drivers/net/mlx4/en_netdev.c
+++ b/drivers/net/mlx4/en_netdev.c
@@ -108,7 +108,7 @@ static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
 	mutex_unlock(&mdev->state_lock);
 }
 
-static u64 mlx4_en_mac_to_u64(u8 *addr)
+u64 mlx4_en_mac_to_u64(u8 *addr)
 {
 	u64 mac = 0;
 	int i;
diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c
index a29abe8..aa3ef2a 100644
--- a/drivers/net/mlx4/en_port.c
+++ b/drivers/net/mlx4/en_port.c
@@ -142,6 +142,38 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
 	return err;
 }
 
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port)
+{
+	struct mlx4_en_query_port_context *qport_context;
+	struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+	struct mlx4_en_port_state *state = &priv->port_state;
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, sizeof(*qport_context));
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B);
+	if (err)
+		goto out;
+	qport_context = mailbox->buf;
+
+	/* This command is always accessed from Ethtool context
+	 * already synchronized, no need in locking */
+	state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK);
+	if ((qport_context->link_speed & MLX4_EN_SPEED_MASK) ==
+	    MLX4_EN_1G_SPEED)
+		state->link_speed = 1000;
+	else
+		state->link_speed = 10000;
+	state->transciver = qport_context->transceiver;
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
+}
 
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 {
diff --git a/drivers/net/mlx4/en_port.h b/drivers/net/mlx4/en_port.h
index e6477f1..f6511aa 100644
--- a/drivers/net/mlx4/en_port.h
+++ b/drivers/net/mlx4/en_port.h
@@ -84,6 +84,20 @@ enum {
 	MLX4_MCAST_ENABLE       = 2,
 };
 
+struct mlx4_en_query_port_context {
+	u8 link_up;
+#define MLX4_EN_LINK_UP_MASK	0x80
+	u8 reserved;
+	__be16 mtu;
+	u8 reserved2;
+	u8 link_speed;
+#define MLX4_EN_SPEED_MASK	0x3
+#define MLX4_EN_1G_SPEED	0x2
+	u16 reserved3[5];
+	__be64 mac;
+	u8 transceiver;
+};
+
 
 struct mlx4_en_stat_out_mbox {
 	/* Received frames with a length of 64 octets */
diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c
index 03b781a..247a408 100644
--- a/drivers/net/mlx4/en_rx.c
+++ b/drivers/net/mlx4/en_rx.c
@@ -654,6 +654,13 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			goto next;
 		}
 
+		if (unlikely(priv->validate_loopback)) {
+			priv->loopback_ok =
+				!strcmp((skb->data + ETH_HLEN), "MLX4 Loopback");
+			dev_kfree_skb_any(skb);
+			goto next;
+		}
+
 		skb->ip_summed = ip_summed;
 		skb->protocol = eth_type_trans(skb, dev);
 		skb_record_rx_queue(skb, cq->ring);
diff --git a/drivers/net/mlx4/en_selftest.c b/drivers/net/mlx4/en_selftest.c
new file mode 100644
index 0000000..8e2042e
--- /dev/null
+++ b/drivers/net/mlx4/en_selftest.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/mlx4/driver.h>
+
+#include "mlx4_en.h"
+
+
+static int mlx4_en_test_registers(struct mlx4_en_priv *priv)
+{
+	return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK,
+			MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv)
+{
+	struct sk_buff *skb;
+	struct ethhdr *ethh;
+	unsigned char *packet;
+	unsigned int packet_size = MLX4_LOOPBACK_TEST_PAYLOAD;
+	unsigned int i;
+	int err;
+
+
+	/* build the pkt before xmit */
+	skb = netdev_alloc_skb(priv->dev,
+			MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN);
+	if (!skb) {
+		en_err(priv, "failed to allocate skb for xmit\n");
+		return -ENOMEM;
+	}
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	ethh = (struct ethhdr *)skb_put(skb, sizeof(struct ethhdr));
+	packet	= (unsigned char *)skb_put(skb, packet_size);
+	memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN);
+	memset(ethh->h_source, 0, ETH_ALEN);
+	ethh->h_proto = htons(ETH_P_ARP);
+	skb_set_mac_header(skb, 0);
+	for (i = 0; i < packet_size; ++i)	/* fill our packet */
+		sprintf(packet, "MLX4 Loopback");
+
+	/* xmit the pkt */
+	err = mlx4_en_xmit(skb, priv->dev);
+	return err;
+}
+
+static int mlx4_en_test_loopback(struct mlx4_en_priv *priv)
+{
+	u32 loopback_ok = 0;
+	int i;
+
+	priv->loopback_ok = 0;
+	priv->validate_loopback = 1;
+
+	/* xmit */
+	if (mlx4_en_test_loopback_xmit(priv)) {
+		en_err(priv, "Transmitting loopback packet failed\n");
+		goto mlx4_en_test_loopback_exit;
+	}
+
+	/* polling for result */
+	for (i = 0; i < MLX4_EN_LOOPBACK_RETRIES; ++i) {
+		msleep(MLX4_EN_LOOPBACK_TIMEOUT);
+		if (priv->loopback_ok) {
+			loopback_ok = 1;
+			break;
+		}
+	}
+	if (!loopback_ok)
+		en_err(priv, "Loopback packet didn't arrive\n");
+
+mlx4_en_test_loopback_exit:
+
+	priv->validate_loopback = 0;
+	return !loopback_ok;
+}
+
+
+static int mlx4_en_test_link(struct mlx4_en_priv *priv)
+{
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+	if (priv->port_state.link_state == 1)
+		return 0;
+	else
+		return 1;
+}
+
+static int mlx4_en_test_speed(struct mlx4_en_priv *priv)
+{
+
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	/* The device currently only supports 10G speed */
+	if (priv->port_state.link_speed != SPEED_10000)
+		return priv->port_state.link_speed;
+	return 0;
+}
+
+
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *tx_ring;
+	int i, running;
+
+	memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST);
+
+	if (*flags & ETH_TEST_FL_OFFLINE) {
+		/* disable the interface */
+		running = netif_running(dev);
+
+		if (running) {
+			netif_tx_disable(dev);
+			dev->trans_start = jiffies;
+		}
+retry_tx:
+		/* Wait untill all tx queues are empty.
+		 * there should not be any additional incoming traffic
+		 * since we turned the carrier off */
+		msleep(200);
+		for (i = 0; i < priv->tx_ring_num && running; i++) {
+			tx_ring = &priv->tx_ring[i];
+			if (tx_ring->prod != (tx_ring->cons + tx_ring->last_nr_txbb))
+				goto retry_tx;
+		}
+
+		if (priv->mdev->dev->caps.loopback_support) {
+			buf[3] = mlx4_en_test_registers(priv);
+			buf[4] = mlx4_en_test_loopback(priv);
+		}
+
+		if (running)
+			netif_tx_wake_all_queues(dev);
+
+	}
+	buf[0] = mlx4_test_interrupts(mdev->dev);
+	buf[1] = mlx4_en_test_link(priv);
+	buf[2] = mlx4_en_test_speed(priv);
+
+	for (i = 0; i < MLX4_EN_NUM_SELF_TEST; i++) {
+		if (buf[i])
+			*flags |= ETH_TEST_FL_FAILED;
+	}
+}
diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c
index 8c72799..47de091 100644
--- a/drivers/net/mlx4/en_tx.c
+++ b/drivers/net/mlx4/en_tx.c
@@ -599,6 +599,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct mlx4_wqe_data_seg *data;
 	struct skb_frag_struct *frag;
 	struct mlx4_en_tx_info *tx_info;
+	struct ethhdr *ethh;
+	u64 mac;
+	u32 mac_l, mac_h;
 	int tx_ind = 0;
 	int nr_txbb;
 	int desc_size;
@@ -675,6 +678,19 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		priv->port_stats.tx_chksum_offload++;
 	}
 
+	if (unlikely(priv->validate_loopback)) {
+		/* Copy dst mac address to wqe */
+		skb_reset_mac_header(skb);
+		ethh = eth_hdr(skb);
+		if (ethh && ethh->h_dest) {
+			mac = mlx4_en_mac_to_u64(ethh->h_dest);
+			mac_h = (u32) ((mac & 0xffff00000000) >> 16);
+			mac_l = (u32) (mac & 0xffffffff);
+			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h);
+			tx_desc->ctrl.imm = cpu_to_be32(mac_l);
+		}
+	}
+
 	/* Handle LSO (TSO) packets */
 	if (lso_header_size) {
 		/* Mark opcode as LSO */
diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h
index 4376147..8655624 100644
--- a/drivers/net/mlx4/mlx4_en.h
+++ b/drivers/net/mlx4/mlx4_en.h
@@ -45,6 +45,7 @@
 #include <linux/mlx4/cq.h>
 #include <linux/mlx4/srq.h>
 #include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
 
 #include "en_port.h"
 
@@ -52,7 +53,6 @@
 #define DRV_VERSION	"1.4.1.1"
 #define DRV_RELDATE	"June 2009"
 
-
 #define MLX4_EN_MSG_LEVEL	(NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
 
 #define en_print(level, priv, format, arg...)			\
@@ -171,10 +171,14 @@ enum {
 
 #define SMALL_PACKET_SIZE      (256 - NET_IP_ALIGN)
 #define HEADER_COPY_SIZE       (128 - NET_IP_ALIGN)
+#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
 
 #define MLX4_EN_MIN_MTU		46
 #define ETH_BCAST		0xffffffffffffULL
 
+#define MLX4_EN_LOOPBACK_RETRIES	5
+#define MLX4_EN_LOOPBACK_TIMEOUT	100
+
 #ifdef MLX4_EN_PERF_STAT
 /* Number of samples to 'average' */
 #define AVG_SIZE			128
@@ -389,6 +393,12 @@ struct mlx4_en_rss_context {
 	__be32 rss_key[10];
 };
 
+struct mlx4_en_port_state {
+	int link_state;
+	int link_speed;
+	int transciver;
+};
+
 struct mlx4_en_pkt_stats {
 	unsigned long broadcast;
 	unsigned long rx_prio[8];
@@ -437,6 +447,7 @@ struct mlx4_en_priv {
 	struct vlan_group *vlgrp;
 	struct net_device_stats stats;
 	struct net_device_stats ret_stats;
+	struct mlx4_en_port_state port_state;
 	spinlock_t stats_lock;
 
 	unsigned long last_moder_packets;
@@ -455,6 +466,8 @@ struct mlx4_en_priv {
 	u16 sample_interval;
 	u16 adaptive_rx_coal;
 	u32 msg_enable;
+	u32 loopback_ok;
+	u32 validate_loopback;
 
 	struct mlx4_hwq_resources res;
 	int link_state;
@@ -494,6 +507,7 @@ struct mlx4_en_priv {
 	struct mlx4_en_port_stats port_stats;
 	struct dev_mc_list *mc_list;
 	struct mlx4_en_stat_out_mbox hw_stats;
+
 };
 
 
@@ -562,6 +576,11 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
 			   u8 promisc);
 
 int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
+
+#define MLX4_EN_NUM_SELF_TEST	5
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf);
+u64 mlx4_en_mac_to_u64(u8 *addr);
 
 /*
  * Globals
-- 
1.6.1.3



^ permalink raw reply related

* [PATCH 3/7] mlx4: Added HW_HEALTH_CHECK command opcode
From: Yevgeny Petrilin @ 2009-10-01 14:33 UTC (permalink / raw)
  To: davem; +Cc: netdev

When the command is executed, the Firmware checks HW state and configuration
registers and returns status.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 include/linux/mlx4/cmd.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 0f82293..78a1b96 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -56,6 +56,7 @@ enum {
 	MLX4_CMD_QUERY_HCA	 = 0xb,
 	MLX4_CMD_QUERY_PORT	 = 0x43,
 	MLX4_CMD_SENSE_PORT	 = 0x4d,
+	MLX4_CMD_HW_HEALTH_CHECK = 0x50,
 	MLX4_CMD_SET_PORT	 = 0xc,
 	MLX4_CMD_ACCESS_DDR	 = 0x2e,
 	MLX4_CMD_MAP_ICM	 = 0xffa,
-- 
1.6.1.3


^ permalink raw reply related

* [PATCH 2/7] mlx4: Query for loopback support
From: Yevgeny Petrilin @ 2009-10-01 14:33 UTC (permalink / raw)
  To: davem; +Cc: netdev

The firmware reports whether loopback capabilities are enabled.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 drivers/net/mlx4/fw.c       |    3 +++
 drivers/net/mlx4/fw.h       |    1 +
 drivers/net/mlx4/main.c     |    1 +
 include/linux/mlx4/device.h |    1 +
 4 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index cee199c..50f1ee8 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -176,6 +176,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
 #define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
 #define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
+#define QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET	0x43
 #define QUERY_DEV_CAP_FLAGS_OFFSET		0x44
 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET		0x48
 #define QUERY_DEV_CAP_UAR_SZ_OFFSET		0x49
@@ -266,6 +267,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev_cap->max_msg_sz = 1 << (field & 0x1f);
 	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
 	dev_cap->stat_rate_support = stat_rate;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET);
+	dev_cap->loopback_support = field & 0x1;
 	MLX4_GET(dev_cap->flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
 	dev_cap->reserved_uars = field >> 4;
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 526d7f3..2cc1ba5 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -74,6 +74,7 @@ struct mlx4_dev_cap {
 	u64 def_mac[MLX4_MAX_PORTS + 1];
 	u16 eth_mtu[MLX4_MAX_PORTS + 1];
 	u16 stat_rate_support;
+	int loopback_support;
 	u32 flags;
 	int reserved_uars;
 	int uar_size;
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 348b09b..e291a5c 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -220,6 +220,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.bmme_flags	     = dev_cap->bmme_flags;
 	dev->caps.reserved_lkey	     = dev_cap->reserved_lkey;
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+	dev->caps.loopback_support   = dev_cap->loopback_support;
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 
 	dev->caps.log_num_macs  = log_num_mac;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index e27a68d..7a423e7 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -228,6 +228,7 @@ struct mlx4_caps {
 	u32			bmme_flags;
 	u32			reserved_lkey;
 	u16			stat_rate_support;
+	int			loopback_support;
 	u8			port_width_cap[MLX4_MAX_PORTS + 1];
 	int			max_gso_sz;
 	int                     reserved_qps_cnt[MLX4_NUM_QP_REGION];
-- 
1.6.1.3


^ permalink raw reply related

* [PATCH 1/7] mlx4: Added interrupts test support
From: Yevgeny Petrilin @ 2009-10-01 14:32 UTC (permalink / raw)
  To: davem; +Cc: netdev

A test that verifies that we can accept interrupts on all
the irq vectors of the device.
Interrupts are checked using the NOP command.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
---
 drivers/net/mlx4/eq.c       |   44 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mlx4/device.h |    1 +
 2 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index bffb799..81619a1 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -698,3 +698,47 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
 
 	kfree(priv->eq_table.uar_map);
 }
+
+/* A test that verifies that we can accept interrupts on all
+ * the irq vectors of the device.
+ * Interrupts are checked using the NOP command.
+ */
+int mlx4_test_interrupts(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+	int err;
+
+	err = mlx4_NOP(dev);
+	/* When not in MSI_X, there is only one irq to check */
+	if (!(dev->flags & MLX4_FLAG_MSI_X))
+		return err;
+
+	/* A loop over all completion vectors, for each vector we will check
+	 * whether it works by mapping command completions to that vector
+	 * and performing a NOP command
+	 */
+	for (i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) {
+		/* Temporary use polling for command completions */
+		mlx4_cmd_use_polling(dev);
+
+		/* Map the new eq to handle all asyncronous events */
+		err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+				  priv->eq_table.eq[i].eqn);
+		if (err) {
+			mlx4_warn(dev, "Failed mapping eq for interrupt test\n");
+			mlx4_cmd_use_events(dev);
+			break;
+		}
+
+		/* Go back to using events */
+		mlx4_cmd_use_events(dev);
+		err = mlx4_NOP(dev);
+	}
+
+	/* Return to default */
+	mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_test_interrupts);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index ce7cc6c..e27a68d 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -480,4 +480,5 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
 int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
 int mlx4_SYNC_TPT(struct mlx4_dev *dev);
 
+int mlx4_test_interrupts(struct mlx4_dev *dev);
 #endif /* MLX4_DEVICE_H */
-- 
1.6.1.3


^ permalink raw reply related

* [PATCH] TI DaVinci EMAC: Minor macro related updates
From: Chaithrika U S @ 2009-10-01 20:25 UTC (permalink / raw)
  To: netdev; +Cc: davem, davinci-linux-open-source, Chaithrika U S

Use BIT for macro definitions wherever possible, remove
unused and redundant macros.

Signed-off-by: Chaithrika U S <chaithrika@ti.com>
---
Applies to Linus' kernel tree

 drivers/net/davinci_emac.c |   26 +++++++++++---------------
 1 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/drivers/net/davinci_emac.c b/drivers/net/davinci_emac.c
index 65a2d0b..a421ec0 100644
--- a/drivers/net/davinci_emac.c
+++ b/drivers/net/davinci_emac.c
@@ -164,16 +164,14 @@ static const char emac_version_string[] = "TI DaVinci EMAC Linux v6.1";
 # define EMAC_MBP_MCASTCHAN(ch)		((ch) & 0x7)
 
 /* EMAC mac_control register */
-#define EMAC_MACCONTROL_TXPTYPE		(0x200)
-#define EMAC_MACCONTROL_TXPACEEN	(0x40)
-#define EMAC_MACCONTROL_MIIEN		(0x20)
-#define EMAC_MACCONTROL_GIGABITEN	(0x80)
-#define EMAC_MACCONTROL_GIGABITEN_SHIFT (7)
-#define EMAC_MACCONTROL_FULLDUPLEXEN	(0x1)
+#define EMAC_MACCONTROL_TXPTYPE		BIT(9)
+#define EMAC_MACCONTROL_TXPACEEN	BIT(6)
+#define EMAC_MACCONTROL_GMIIEN		BIT(5)
+#define EMAC_MACCONTROL_GIGABITEN	BIT(7)
+#define EMAC_MACCONTROL_FULLDUPLEXEN	BIT(0)
 #define EMAC_MACCONTROL_RMIISPEED_MASK	BIT(15)
 
 /* GIGABIT MODE related bits */
-#define EMAC_DM646X_MACCONTORL_GMIIEN	BIT(5)
 #define EMAC_DM646X_MACCONTORL_GIG	BIT(7)
 #define EMAC_DM646X_MACCONTORL_GIGFORCE	BIT(17)
 
@@ -192,10 +190,10 @@ static const char emac_version_string[] = "TI DaVinci EMAC Linux v6.1";
 #define EMAC_RX_BUFFER_OFFSET_MASK	(0xFFFF)
 
 /* MAC_IN_VECTOR (0x180) register bit fields */
-#define EMAC_DM644X_MAC_IN_VECTOR_HOST_INT	      (0x20000)
-#define EMAC_DM644X_MAC_IN_VECTOR_STATPEND_INT	      (0x10000)
-#define EMAC_DM644X_MAC_IN_VECTOR_RX_INT_VEC	      (0x0100)
-#define EMAC_DM644X_MAC_IN_VECTOR_TX_INT_VEC	      (0x01)
+#define EMAC_DM644X_MAC_IN_VECTOR_HOST_INT	BIT(17)
+#define EMAC_DM644X_MAC_IN_VECTOR_STATPEND_INT	BIT(16)
+#define EMAC_DM644X_MAC_IN_VECTOR_RX_INT_VEC	BIT(8)
+#define EMAC_DM644X_MAC_IN_VECTOR_TX_INT_VEC	BIT(0)
 
 /** NOTE:: For DM646x the IN_VECTOR has changed */
 #define EMAC_DM646X_MAC_IN_VECTOR_RX_INT_VEC	BIT(EMAC_DEF_RX_CH)
@@ -203,7 +201,6 @@ static const char emac_version_string[] = "TI DaVinci EMAC Linux v6.1";
 #define EMAC_DM646X_MAC_IN_VECTOR_HOST_INT	BIT(26)
 #define EMAC_DM646X_MAC_IN_VECTOR_STATPEND_INT	BIT(27)
 
-
 /* CPPI bit positions */
 #define EMAC_CPPI_SOP_BIT		BIT(31)
 #define EMAC_CPPI_EOP_BIT		BIT(30)
@@ -747,8 +744,7 @@ static void emac_update_phystatus(struct emac_priv *priv)
 
 	if (priv->speed == SPEED_1000 && (priv->version == EMAC_VERSION_2)) {
 		mac_control = emac_read(EMAC_MACCONTROL);
-		mac_control |= (EMAC_DM646X_MACCONTORL_GMIIEN |
-				EMAC_DM646X_MACCONTORL_GIG |
+		mac_control |= (EMAC_DM646X_MACCONTORL_GIG |
 				EMAC_DM646X_MACCONTORL_GIGFORCE);
 	} else {
 		/* Clear the GIG bit and GIGFORCE bit */
@@ -2105,7 +2101,7 @@ static int emac_hw_enable(struct emac_priv *priv)
 
 	/* Enable MII */
 	val = emac_read(EMAC_MACCONTROL);
-	val |= (EMAC_MACCONTROL_MIIEN);
+	val |= (EMAC_MACCONTROL_GMIIEN);
 	emac_write(EMAC_MACCONTROL, val);
 
 	/* Enable NAPI and interrupts */
-- 
1.5.6


^ permalink raw reply related

* [PATCH] skge: use unique IRQ name
From: Michal Schmidt @ 2009-10-01 10:27 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20090922092826.5302225c@s6510>

Most network drivers request their IRQ when the interface is activated.
skge does it in ->probe() instead, because it can work with two-port
cards where the two net_devices use the same IRQ. This works fine most
of the time, except in some situations when the interface gets renamed.
Consider this example:

1. modprobe skge
   The card is detected as eth0 and requests IRQ 17. Directory
   /proc/irq/17/eth0 is created.
2. There is a udev rule which says this interface should be called
   eth1, so udev renames eth0 -> eth1.
3. modprobe 8139too
   The Realtek card is detected as eth0. It will be using IRQ 17 too.
4. ip link set eth0 up
   Now 8139too requests IRQ 17.

The result is:
WARNING: at fs/proc/generic.c:590 proc_register ...
proc_dir_entry '17/eth0' already registered
...
And "ls /proc/irq/17" shows two subdirectories, both called eth0.

Fix it by using a unique name for skge's IRQ, based on the PCI address.
The naming from the example then looks like this:
$ grep skge /proc/interrupts
 17:        169   IO-APIC-fasteoi   skge@0000:00:0a.0, eth0

The irqbalance daemon will have to be taught to recognize "skge@" as an
Ethernet interrupt. This will be a one-liner addition in classify.c. I
will send a patch to irqbalance if this change is accepted.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>

Index: kernel/drivers/net/skge.c
===================================================================
--- kernel.orig/drivers/net/skge.c
+++ kernel/drivers/net/skge.c
@@ -3895,6 +3895,7 @@ static int __devinit skge_probe(struct p
 	struct net_device *dev, *dev1;
 	struct skge_hw *hw;
 	int err, using_dac = 0;
+	size_t irq_name_len;

 	err = pci_enable_device(pdev);
 	if (err) {
@@ -3935,11 +3936,13 @@ static int __devinit skge_probe(struct p
 #endif

 	err = -ENOMEM;
-	hw = kzalloc(sizeof(*hw), GFP_KERNEL);
+	irq_name_len = strlen(DRV_NAME) + strlen(dev_name(&pdev->dev)) + 2;
+	hw = kzalloc(sizeof(*hw) + irq_name_len, GFP_KERNEL);
 	if (!hw) {
 		dev_err(&pdev->dev, "cannot allocate hardware struct\n");
 		goto err_out_free_regions;
 	}
+	sprintf(hw->irq_name, DRV_NAME "@%s", dev_name(&pdev->dev));

 	hw->pdev = pdev;
 	spin_lock_init(&hw->hw_lock);
@@ -3974,7 +3977,7 @@ static int __devinit skge_probe(struct p
 		goto err_out_free_netdev;
 	}

-	err = request_irq(pdev->irq, skge_intr, IRQF_SHARED, dev->name, hw);
+	err = request_irq(pdev->irq, skge_intr, IRQF_SHARED, hw->irq_name, hw);
 	if (err) {
 		dev_err(&pdev->dev, "%s: cannot assign irq %d\n",
 		       dev->name, pdev->irq);
Index: kernel/drivers/net/skge.h
===================================================================
--- kernel.orig/drivers/net/skge.h
+++ kernel/drivers/net/skge.h
@@ -2423,6 +2423,8 @@ struct skge_hw {
 	u16		     phy_addr;
 	spinlock_t	     phy_lock;
 	struct tasklet_struct phy_task;
+
+	char		     irq_name[0]; /* name for /proc/interrupts */
 };

 enum pause_control {

^ permalink raw reply

* Re: [PATCH] pktgen: Fix delay handling
From: Eric Dumazet @ 2009-10-01 10:04 UTC (permalink / raw)
  To: Stephen Hemminger, David S. Miller
  Cc: Jesper Dangaard Brouer, Robert Olsson, netdev
In-Reply-To: <4AC47AB6.9000501@gmail.com>

Eric Dumazet a écrit :
> After last pktgen changes, delay handling is wrong.
> 
> pktgen actually sends packets at full line speed.
> 
> Fix is to update pkt_dev->next_tx even if spin() returns early,
> so that next spin() calls have a chance to see a positive delay.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Oh well, I hit this bug on linux-2.6 git tree, but I did the patch on net-next-2.6

But it appears net/core/pktgen.c is different on net-next-2.6

Stephen, David, I am a bit lost here, something went wrong in a merge process ?

In any case, here is the patch against Linus tree, where bug is present.

Thanks

[PATCH] pktgen: Fix delay handling

After last pktgen changes, delay handling is wrong.

pktgen actually sends packets at full line speed.

Fix is to update pkt_dev->next_tx even if spin() returns early,
so that next spin() calls have a chance to see a positive delay.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4d11c28..b694552 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2105,15 +2105,17 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 {
 	ktime_t start_time, end_time;
-	s32 remaining;
+	s64 remaining;
 	struct hrtimer_sleeper t;

 	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	hrtimer_set_expires(&t.timer, spin_until);

 	remaining = ktime_to_us(hrtimer_expires_remaining(&t.timer));
-	if (remaining <= 0)
+	if (remaining <= 0) {
+		pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
 		return;
+	}

 	start_time = ktime_now();
 	if (remaining < 100)

^ permalink raw reply related

* [PATCH] gigaset/CAPI: accept any number type/plan
From: Tilman Schmidt @ 2009-10-01  9:53 UTC (permalink / raw)
  To: Karsten Keil, Karsten Keil
  Cc: Hansjoerg Lipp, davem, i4ldeveloper, netdev, linux-kernel

Be more liberal in accepting CAPI CONNECT_REQ message parameters
Called Party Number and Calling Party Number:
* Accept Numbering plan "ISDN/Telephony" as supported.
* Ignore unsupported values for Type of number, Numbering plan,
  Presentation indicator and Screening indicator with a warning
  instead of rejecting the entire request.

Signed-off-by: Tilman Schmidt <tilman@imap.cc>
---
A second small fix to the new Gigaset CAPI interface resulting from
testing with more applications. Please tell me if you'd prefer me
to reissue "[PATCH 12/12] gigaset: add Kernel CAPI interface" with
both fixes folded in.

 drivers/isdn/gigaset/capi.c |   29 ++++++++++++++++-------------
 1 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/isdn/gigaset/capi.c b/drivers/isdn/gigaset/capi.c
index 8afff37..c276a92 100644
--- a/drivers/isdn/gigaset/capi.c
+++ b/drivers/isdn/gigaset/capi.c
@@ -1236,12 +1236,14 @@ static void do_connect_req(struct gigaset_capi_ctr *iif,
 		goto error;
 	}
 	l = *pp++;
-	/* check number type/numbering plan byte */
-	if (*pp != 0x80) {
+	/* check type of number/numbering plan byte */
+	switch (*pp) {
+	case 0x80:	/* unknown type / unknown numbering plan */
+	case 0x81:	/* unknown type / ISDN/Telephony numbering plan */
+		break;
+	default:	/* others: warn about potential misinterpretation */
 		dev_notice(cs->dev, "%s: %s type/plan 0x%02x unsupported\n",
 			   "CONNECT_REQ", "Called party number", *pp);
-		info = CapiIllMessageParmCoding;
-		goto error;
 	}
 	pp++;
 	l--;
@@ -1266,26 +1268,28 @@ static void do_connect_req(struct gigaset_capi_ctr *iif,
 	if (pp != NULL && *pp > 0) {
 		l = *pp++;
 
-		/* check number type/numbering plan byte */
-		if (*pp) {
-			/* ToDo: allow for Ext=1? */
+		/* check type of number/numbering plan byte */
+		/* ToDo: handle Ext=1? */
+		switch (*pp) {
+		case 0x00:	/* unknown type / unknown numbering plan */
+		case 0x01:	/* unknown type / ISDN/Telephony num. plan */
+			break;
+		default:
 			dev_notice(cs->dev,
 				   "%s: %s type/plan 0x%02x unsupported\n",
 				   "CONNECT_REQ", "Calling party number", *pp);
-			info = CapiIllMessageParmCoding;
-			goto error;
 		}
 		pp++;
 		l--;
 
-		/* check presentation/screening indicator */
+		/* check presentation indicator */
 		if (!l) {
 			dev_notice(cs->dev, "%s: %s IE truncated\n",
 				   "CONNECT_REQ", "Calling party number");
 			info = CapiIllMessageParmCoding;
 			goto error;
 		}
-		switch (*pp) {
+		switch (*pp & 0xfc) { /* ignore Screening indicator */
 		case 0x80:	/* Presentation allowed */
 			s = "^SCLIP=1\r";
 			break;
@@ -1297,8 +1301,7 @@ static void do_connect_req(struct gigaset_capi_ctr *iif,
 				   "CONNECT_REQ",
 				   "Presentation/Screening indicator",
 				   *pp);
-			info = CapiIllMessageParmCoding;
-			goto error;
+			s = "^SCLIP=1\r";
 		}
 		commands[AT_CLIP] = kstrdup(s, GFP_KERNEL);
 		if (!commands[AT_CLIP])
-- 
1.6.2.1.214.ge986c

^ permalink raw reply related

* Re: [RFCv4 PATCH 2/2] net: Allow protocols to provide an unlocked_recvmsg socket method
From: Nir Tzachar @ 2009-10-01  9:49 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo,
	Linux Networking Development Mailing List
  Cc: Ziv Ayalon
In-Reply-To: <20090923043813.GA6464@ghostprotocols.net>

Hi Arnaldo

I have repeated the tests using net-next on top of linus' git tree (I
hope I got it right..) and the patches you sent me. Things did not get
better, and in most cases were even worse; the recvmmsg parts
distinctly showed better throughput, but the latency has more than
doubled.

The simplest test of using a batch size of 1 results with recvmmsg's
latency over 1000 micro, while regular recvmsg is around 450 micro.
(note that to use 1 packet there is a small bug in the reg_recv which
needs to be fixed. Namely, change ret = -1 to ret = 0). On the
previous system config -- part 0001 of the patch, on top of 2.6.31 --
the latency of a single packet batch is 370 micro.

So, there seems to be a regression with the kernel tree I am using, or
with part 0002 of the patch. I'll try running the net-next with only
part 1 of the patch and report.

Cheers.

^ permalink raw reply

* [PATCH] pktgen: Fix delay handling
From: Eric Dumazet @ 2009-10-01  9:47 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jesper Dangaard Brouer, Robert Olsson, netdev, David S. Miller
In-Reply-To: <20090930172532.2c2d1d42@s6510>

After last pktgen changes, delay handling is wrong.

pktgen actually sends packets at full line speed.

Fix is to update pkt_dev->next_tx even if spin() returns early,
so that next spin() calls have a chance to see a positive delay.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/pktgen.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0bcecbf..1a0682e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2106,15 +2106,17 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 {
 	ktime_t start;
-	s32 remaining;
+	s64 remaining;
 	struct hrtimer_sleeper t;

 	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	hrtimer_set_expires(&t.timer, spin_until);

 	remaining = ktime_to_us(hrtimer_expires_remaining(&t.timer));
-	if (remaining <= 0)
+	if (remaining <= 0) {
+		pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
 		return;
+	}

 	start = ktime_now();
 	if (remaining < 100)

^ permalink raw reply related

* [PATCHv2] IPv4 TCP fails to send window scale option when window scale is zero
From: Gilad Ben-Yossef @ 2009-10-01  9:39 UTC (permalink / raw)
  To: Netdev; +Cc: Ori Finkalman, Ilpo Järvinen, Eric Dumazet

From: Ori Finkelman <ori@comsleep.com>

Acknowledge TCP window scale support by inserting the proper option in 
SYN/ACK and SYN headers
even if our window scale is zero.

This fixes the following observed behavior:

1. Client sends a SYN with TCP window scaling option and non zero window 
scale value to a Linux box.
2. Linux box notes large receive window from client.
3. Linux decides on a zero value of window scale for its part.
4. Due to a comparison against the requested window scale option, Linux does
not send the window scale TCP option header on SYN/ACK at all.

With the following result:

Client box thinks TCP window scaling is not supported, since SYN/ACK had 
no TCP window scale option,
while Linux thinks that TCP window scaling is supported (and scale might 
be non zero), since SYN had TCP window scale option and we have a 
mismatched idea between the client and server regarding window sizes.

Probably it also fixes up the following bug (not observed in practice):

1. Linux box opens TCP connection to some server.
2. Linux decides on zero value of window scale.
3. Due to a comparison against the computed window scale option, Linux does
not set the window scale TCP option header on SYN.

With the expected result that the server OS does not use window scale 
option due to not receiving such an option in the SYN headers, leading 
to suboptimal performance.

---

Original bug reported and patch written by Ori Finkelman from Comsleep 
Ltd. I've fixed the SYN header case based on feedback from Eric Dumazet 
and Ilpo Jarvinen, as part of trying to get the patch mainlined.

The SYN/ACK behavior was observed with a Windows box as the client and 
latest Debian kernel but to the best
of my understanding this can happen with latest kernel versions and 
other client OS (probably also Linux) as well.

The SYN/ACK scenario was tested on an x86 system. The SYN scenario was
only compile tested.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5200aab..fcd278a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -361,6 +361,7 @@ static inline int tcp_urg_mode(const struct tcp_sock 
*tp)
 #define OPTION_SACK_ADVERTISE  (1 << 0)
 #define OPTION_TS              (1 << 1)
 #define OPTION_MD5             (1 << 2)
+#define OPTION_WSCALE          (1 << 3)

 struct tcp_out_options {
        u8 options;             /* bit field of OPTION_* */
@@ -427,7 +428,7 @@ static void tcp_options_write(__be32 *ptr, struct 
tcp_sock *tp,
                               TCPOLEN_SACK_PERM);
        }

-       if (unlikely(opts->ws)) {
+       if (unlikely(OPTION_WSCALE & opts->options)) {
                *ptr++ = htonl((TCPOPT_NOP << 24) |
                               (TCPOPT_WINDOW << 16) |
                               (TCPOLEN_WINDOW << 8) |
@@ -494,8 +495,8 @@ static unsigned tcp_syn_options(struct sock *sk, 
struct sk_buff *skb,
        }
        if (likely(sysctl_tcp_window_scaling)) {
                opts->ws = tp->rx_opt.rcv_wscale;
-               if (likely(opts->ws))
-                       size += TCPOLEN_WSCALE_ALIGNED;
+               opts->options |= OPTION_WSCALE;
+               size += TCPOLEN_WSCALE_ALIGNED;
        }
        if (likely(sysctl_tcp_sack)) {
                opts->options |= OPTION_SACK_ADVERTISE;
@@ -537,8 +538,8 @@ static unsigned tcp_synack_options(struct sock *sk,

        if (likely(ireq->wscale_ok)) {
                opts->ws = ireq->rcv_wscale;
-               if (likely(opts->ws))
-                       size += TCPOLEN_WSCALE_ALIGNED;
+               opts->options |= OPTION_WSCALE;
+               size += TCPOLEN_WSCALE_ALIGNED;
        }
        if (likely(doing_ts)) {
                opts->options |= OPTION_TS;

-- 
Gilad Ben-Yossef
Chief Coffee Drinker & CTO
Codefidence Ltd.

Web:   http://codefidence.com
Cell:  +972-52-8260388
Skype: gilad_codefidence
Tel:   +972-8-9316883 ext. 201
Fax:   +972-8-9316884
Email: gilad@codefidence.com

Check out our Open Source technology and training blog - http://tuxology.net

	"Now the world has gone to bed
	 Darkness won't engulf my head
	 I can see by infra-red
	 How I hate the night."

^ permalink raw reply related

* Re: [PATCH] [RFC] IPv4 TCP fails to send window scale option when window scale is zero
From: Gilad Ben-Yossef @ 2009-10-01  9:39 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Ilpo Järvinen, Netdev, Ori Finkalman
In-Reply-To: <4AC357D3.7080606@gmail.com>

Eric Dumazet wrote:

>
>>>
>>> Your version slows down the tcp_options_write() function, once per tx packet.
>>>       
>> Are you serious that anding would cost that much? :-/
>>     
>
> Not really :)
>   
LOL I was trying very hard to understand why you thought this was such 
an issue. My head was flying into all sorts of weird directions like 
cache effects and the like... ;-)

<snip>
> Yes, wscale 0 is RFC valid, but are we sure some equipment wont play funny games
> with such value ? At least sending "wscale 1-14" must be working...
>   
Well, there at least used to be routers that would actually zero the
WS value in transit while leaving the option set, but this is another 
issue of course.

Anyway, I know Vista at least does set the window scale TCP option by 
default. One assumes they occasionally send a zero value scale. Not that 
Vista is such a good benchmark to compare Linux to but at least I tend 
to believe the issue would have popped up if it is common enough.

I can craft a patch to introduce a route table option to set TCP window 
scale minimum and maximum sizes, similar to window size route option, if 
there is a need for that. Personally, I think it is just overkill.
>
> My quick&dirty patch was only for discussion, I have no strong opinion on it,
> only that was on one place to patch instead of two/three/four I dont know yet.
>
> So please Gilad & Ori send us a new patch :)
>
>   
Revised patch follows in next email.

Gilad

   

-- 
Gilad Ben-Yossef
Chief Coffee Drinker & CTO
Codefidence Ltd.

Web:   http://codefidence.com
Cell:  +972-52-8260388
Skype: gilad_codefidence
Tel:   +972-8-9316883 ext. 201
Fax:   +972-8-9316884
Email: gilad@codefidence.com

Check out our Open Source technology and training blog - http://tuxology.net

	"Now the world has gone to bed
	 Darkness won't engulf my head
	 I can see by infra-red
	 How I hate the night."


^ permalink raw reply

* Re: tg3: Badness at kernel/mutex.c:207
From: Felix Radensky @ 2009-10-01  9:36 UTC (permalink / raw)
  To: Matt Carlson; +Cc: netdev@vger.kernel.org
In-Reply-To: <20090928205128.GA12652@xw6200.broadcom.net>

Hi, Matt

Matt Carlson wrote:
> On Sat, Sep 26, 2009 at 02:20:57PM -0700, Felix Radensky wrote:
>   
>> Hi,
>>
>> I'm running linux-2.6.31 on a custom MPC8536 based board with BCM57760 chip.
>> Both tg3 driver, and Broadcom PHY driver are modules.
>>
>> Each time I run ifconfig eth2 up, I get the following error message:
>>
>> Badness at kernel/mutex.c:207
>> NIP: c025132c LR: c0251314 CTR: c0251334
>> REGS: efbedbd0 TRAP: 0700   Not tainted  (2.6.31)
>> MSR: 00029000 <EE,ME,CE>  CR: 24020422  XER: 00000000
>> TASK = efacce10[1080] 'ifconfig' THREAD: efbec000
>> GPR00: 00000000 efbedc80 efacce10 00000001 00007020 00000002 00000000 
>> 00000200
>> GPR08: 00029000 c0350000 c0330000 00000001 24020424 10057d94 000002a0 
>> 1000d82c
>> GPR16: 1000d81c 1000d814 10010000 10050000 ef897a0c efbede18 ffff8914 
>> ef897a00
>> GPR24: 00008000 c034b480 efbec000 efb0122c c0350000 efacce10 ef82d2c0 
>> efb01228
>> NIP [c025132c] __mutex_lock_slowpath+0x1f0/0x1f8
>> LR [c0251314] __mutex_lock_slowpath+0x1d8/0x1f8
>> Call Trace:
>> [efbedcd0] [c025134c] mutex_lock+0x18/0x34
>> [efbedcf0] [f534a228] tg3_chip_reset+0x7cc/0x9f8 [tg3]
>> [efbedd20] [f534a8f0] tg3_reset_hw+0x58/0x2360 [tg3]
>> [efbedd70] [f5351dd4] tg3_open+0x610/0x910 [tg3]
>> [efbeddb0] [c01e1c6c] dev_open+0x100/0x138
>> [efbeddd0] [c01dff20] dev_change_flags+0x80/0x1ac
>> [efbeddf0] [c02232cc] devinet_ioctl+0x648/0x824
>> [efbede60] [c0223de4] inet_ioctl+0xcc/0xf8
>> [efbede70] [c01cdf44] sock_ioctl+0x60/0x300
>> [efbede90] [c008a35c] vfs_ioctl+0x34/0x8c
>> [efbedea0] [c008a580] do_vfs_ioctl+0x88/0x724
>> [efbedf10] [c008ac5c] sys_ioctl+0x40/0x74
>> [efbedf40] [c000f814] ret_from_syscall+0x0/0x3c
>> Instruction dump:
>> 0fe00000 4bfffe80 801a000c 5409016f 4182fe60 4bf0f6d9 2f830000 41befe54
>> 3d20c035 8009c2c0 2f800000 40befe44 <0fe00000> 4bfffe3c 9421ffe0 7c0802a6
>>
>> Does it indicate a real problem, or something that can be ignored ?
>>
>> Additional information from kernel log:
>>
>> tg3.c:v3.99 (April 20, 2009)
>> tg3 0002:05:00.0: enabling bus mastering
>> tg3 0002:05:00.0: PME# disabled
>> tg3 mdio bus: probed
>> eth2: Tigon3 [partno(BCM57760) rev 57780001] (PCI Express) MAC address 
>> 00:10:18:00:00:00
>> eth2: attached PHY driver [Broadcom BCM57780] (mii_bus:phy_addr=500:01)
>> eth2: RXcsums[1] LinkChgREG[0] MIirq[0] ASF[0] TSOcap[1]
>> eth2: dma_rwctrl[76180000] dma_mask[64-bit]
>> tg3 0002:05:00.0: PME# disabled
>>     
>
> Yes, this is a real problem.  The driver is taking the MDIO bus lock
> while holding the device's own spinlock.  I think I may have a
> workaround.  Let me test it and get back to you.
>   

Did you have a chance to look into it ?

Thanks.

Felix.


^ permalink raw reply

* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Michael S. Tsirkin @ 2009-10-01  9:28 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Gregory Haskins, Ira W. Snyder, netdev, virtualization, kvm,
	linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze,
	alacrityvm-devel
In-Reply-To: <4AC46989.7030502@redhat.com>

On Thu, Oct 01, 2009 at 10:34:17AM +0200, Avi Kivity wrote:
>> Second, I do not use ioeventfd anymore because it has too many problems
>> with the surrounding technology.  However, that is a topic for a
>> different thread.
>>    
>
> Please post your issues.  I see ioeventfd/irqfd as critical kvm interfaces.

I second that. AFAIK ioeventfd/irqfd got exposed to userspace in 2.6.32-rc1,
if there are issues we better nail them before 2.6.32 is out.
And yes, please start a different thread.

-- 
MST

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* tg3: bug report, driver freeze (transmit timed out), ifdown+ifup makes interface work again
From: Jesper Dangaard Brouer @ 2009-10-01  8:34 UTC (permalink / raw)
  To: Michael Chan, Matt Carlson
  Cc: netdev@vger.kernel.org, sander.contrib, David S. Miller


A friend of mine is experiencing problems with his tg3 based NIC.  The
network stops working (transmit timed out), and he had to
access the console to get it working again.

Kernel: 2.6.26-2-686 (standard Debian package)
OS: Debian Lenny 5.0 (all upgrades)

Ethernet controller: Broadcom Corporation NetXtreme BCM5700 Gigabit Ethernet (rev 12)
 Subsystem: Dell Broadcom BCM5700
 eth1: Tigon3 [partno(none) rev 7102 PHY(5401)]

Is this a known issue? (If so what kernel is it fixed in... that I can
make him test...)

Cite:
According to the kernel log the tg3 driver tries to reset itself.
However, even though it looks like the interface is up, it is not!

A manual ifdown eth1 && ifup eth1 does the trick.

According to my rtorrent I had used about 4GB of traffic (combined
down/up)..  so a qualified guess could be a 32-bit limitation in the
tg3-driver?


Server specs:
 DELL PowerEdge 2550
 2 GB Ram
 2x1 Ghz Pentium III (Coppermine)


Sep 30 11:45:46 samurai kernel: [1145615.063992] NETDEV WATCHDOG: eth1: transmit timed out
Sep 30 11:45:46 samurai kernel: [1145615.064028] tg3: eth1: transmit timed out, resetting
Sep 30 11:45:46 samurai kernel: [1145615.064052] tg3: DEBUG: MAC_TX_STATUS[00000008] MAC_RX_STATUS[00000008]
Sep 30 11:45:46 samurai kernel: [1145615.064078] tg3: DEBUG: RDMAC_STATUS[00000000] WDMAC_STATUS[00000000]
Sep 30 11:45:46 samurai kernel: [1145615.064119] ------------[ cut here]------------
Sep 30 11:45:46 samurai kernel: [1145615.064141] WARNING: at net/sched/sch_generic.c:222 dev_watchdog+0x8f/0xdc()
Sep 30 11:45:46 samurai kernel: [1145615.064174] Modules linked in: iptable_mangle iptable_nat nf_nat ipt_LOG nf_conntrack_ip
v4 xt_state nf_conntrack xt_tcpudp iptable_filter ip_tables x_tables ipv6 dm_snapshot dm_mirror dm_log dm_mod loop parport_pc
 parport evdev psmouse snd_pcm snd_timer snd soundcore snd_page_alloc serio_raw pcspkr shpchp pci_hotplug i2c_piix4 i2c_core
button sworks_agp agpgart dcdbas ext3 jbd mbcache sg sd_mod ide_cd_mod cdrom ide_pci_generic serverworks ide_core floppy aacr
aid aic7xxx scsi_transport_spi ata_generic e100 ohci_hcd libata scsi_mod dock tg3 usbcore 8139cp 8139too mii thermal processo
r fan thermal_sys [last unloaded: scsi_wait_scan]
Sep 30 11:45:46 samurai kernel: [1145615.064517] Pid: 0, comm: swapper Not tainted 2.6.26-2-686 #1
Sep 30 11:45:46 samurai kernel: [1145615.064549]  [<c01225f3>] warn_on_slowpath+0x40/0x66
Sep 30 11:45:46 samurai kernel: [1145615.064594]  [<c0119160>] hrtick_start_fair+0xeb/0x12c
Sep 30 11:45:46 samurai kernel: [1145615.064635]  [<c0118926>] enqueue_task+0x52/0x5d
Sep 30 11:45:46 samurai kernel: [1145615.064663]  [<c011894c>] activate_task+0x1b/0x26
Sep 30 11:45:46 samurai kernel: [1145615.064690]  [<c011b6f3>] try_to_wake_up+0xe8/0xf1
Sep 30 11:45:46 samurai kernel: [1145615.064723]  [<c01319a9>] autoremove_wake_function+0xd/0x2d
Sep 30 11:45:46 samurai kernel: [1145615.064760]  [<c01184d1>] __wake_up_common+0x2e/0x58
Sep 30 11:45:46 samurai kernel: [1145615.064792]  [<c011a6bb>] __wake_up+0x29/0x39
Sep 30 11:45:46 samurai kernel: [1145615.064822]  [<c012f11f>] insert_work+0x58/0x5c
Sep 30 11:45:46 samurai kernel: [1145615.064849]  [<c012f40d>] __queue_work+0x1c/0x28
Sep 30 11:45:46 samurai kernel: [1145615.064876]  [<c012f468>] queue_work+0x33/0x3c
Sep 30 11:45:46 samurai kernel: [1145615.064903]  [<c0267035>] dev_watchdog+0x8f/0xdc
Sep 30 11:45:46 samurai kernel: [1145615.064930]  [<c01296d4>] run_timer_softirq+0x11a/0x17c
Sep 30 11:45:46 samurai kernel: [1145615.064960]  [<c0266fa6>] dev_watchdog+0x0/0xdc
Sep 30 11:45:46 samurai kernel: [1145615.064993]  [<c01265f5>] __do_softirq+0x66/0xd3
Sep 30 11:45:46 samurai kernel: [1145615.065022]  [<c01266a7>] do_softirq+0x45/0x53
Sep 30 11:45:46 samurai kernel: [1145615.065047]  [<c012695e>] irq_exit+0x35/0x67
Sep 30 11:45:46 samurai kernel: [1145615.065070]  [<c01101c9>] smp_apic_timer_interrupt+0x6b/0x76
Sep 30 11:45:46 samurai kernel: [1145615.065098]  [<c0102656>] default_idle+0x0/0x53
Sep 30 11:45:46 samurai kernel: [1145615.065127]  [<c0104364>] apic_timer_interrupt+0x28/0x30
Sep 30 11:45:46 samurai kernel: [1145615.065156]  [<c0102656>] default_idle+0x0/0x53
Sep 30 11:45:46 samurai kernel: [1145615.065189]  [<c0114d78>] native_safe_halt+0x2/0x3
Sep 30 11:45:46 samurai kernel: [1145615.065225]  [<c0102683>] default_idle+0x2d/0x53
Sep 30 11:45:46 samurai kernel: [1145615.065250]  [<c01025ce>] cpu_idle+0xab/0xcb
Sep 30 11:45:46 samurai kernel: [1145615.065291]  =======================
Sep 30 11:45:46 samurai kernel: [1145615.065311] ---[ end trace 0dbb94f68d53053b ]---
Sep 30 11:45:46 samurai kernel: [1145615.457820] tg3: tg3_stop_block timed out, ofs=2c00 enable_bit=2
Sep 30 11:45:46 samurai kernel: [1145615.557909] tg3: tg3_stop_block timed out, ofs=3400 enable_bit=2
Sep 30 11:45:46 samurai kernel: [1145615.657903] tg3: tg3_stop_block timed out, ofs=2400 enable_bit=2
Sep 30 11:45:46 samurai kernel: [1145615.758203] tg3: tg3_stop_block timed out, ofs=1800 enable_bit=2
Sep 30 11:45:47 samurai kernel: [1145615.858203] tg3: tg3_stop_block timed out, ofs=c00 enable_bit=2
Sep 30 11:45:47 samurai kernel: [1145615.958203] tg3: tg3_stop_block timed out, ofs=4800 enable_bit=2
Sep 30 11:45:47 samurai kernel: [1145616.089213] tg3: eth1: Link is down.
Sep 30 11:45:49 samurai kernel: [1145618.565251] tg3: eth1: Link is up at 100 Mbps, full duplex.
Sep 30 11:45:49 samurai kernel: [1145618.565288] tg3: eth1: Flow control is off for TX and off for RX.

Sep 30 14:02:09 samurai kernel: [1154721.802641] NETDEV WATCHDOG: eth1: transmit timed out
Sep 30 14:02:09 samurai kernel: [1154721.802679] tg3: eth1: transmit timed out, resetting
Sep 30 14:02:09 samurai kernel: [1154721.802702] tg3: DEBUG: MAC_TX_STATUS[00000008] MAC_RX_STATUS[00000008]
Sep 30 14:02:09 samurai kernel: [1154721.802729] tg3: DEBUG: RDMAC_STATUS[00000000] WDMAC_STATUS[00000000]
Sep 30 14:02:09 samurai kernel: [1154721.974663] tg3: tg3_stop_block timed out, ofs=1800 enable_bit=2
Sep 30 14:02:09 samurai kernel: [1154722.078613] tg3: tg3_stop_block timed out, ofs=4800 enable_bit=2
Sep 30 14:02:09 samurai kernel: [1154722.206614] tg3: eth1: Link is down.
Sep 30 14:02:11 samurai kernel: [1154724.209290] tg3: eth1: Link is up at 100 Mbps, full duplex.
Sep 30 14:02:11 samurai kernel: [1154724.209328] tg3: eth1: Flow control is off for TX and off for RX.

-- 
Med venlig hilsen / Best regards
  Jesper Brouer
  ComX Networks A/S
  Linux Network developer
  Cand. Scient Datalog / MSc.
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer

lspci -vvv
01:08.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5700 Gigabit Ethernet (rev 12)
        Subsystem: Dell Broadcom BCM5700
        Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
        Status: Cap+ 66MHz+ UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort-<TAbort- <MAbort- >SERR- <PERR- INTx-
        Latency: 32 (16000ns min), Cache Line Size: 32 bytes
        Interrupt: pin A routed to IRQ 17
        Region 0: Memory at feb00000 (64-bit, non-prefetchable) [size=64K]
        Capabilities: [40] PCI-X non-bridge device
                Command: DPERE- ERO- RBC=512 OST=1
                Status: Dev=ff:1f.1 64bit+ 133MHz+ SCD- USC- DC=simple DMMRBC=512 DMOST=1 DMCRS=8 RSCEM- 266MHz- 533MHz-
        Capabilities: [48] Power Management version 2
                Flags: PMEClk- DSI- D1- D2- AuxCurrent=0mA PME(D0-,D1-,D2-,D3hot+,D3cold-)
                Status: D0 PME-Enable- DSel=0 DScale=1 PME-
        Capabilities: [50] Vital Product Data <?>
        Capabilities: [58] Message Signalled Interrupts: Mask- 64bit+ Queue=0/3 Enable-
                Address: da6771daee5b44a4  Data: 889a
        Kernel driver in use: tg3
        Kernel modules: tg3


ethtool -i eth1:
driver: tg3
version: 3.92.1
firmware-version:
bus-info: 0000:01:08.0

Sep 18 22:34:19 samurai kernel: [ 4.707217] eth1: Tigon3 [partno(none) rev 7102 PHY(5401)] (PCI:66MHz:64-bit) 10/100/1000B
ase-T Ethernet 00:06:5b:39:d3:4a
Sep 18 22:34:19 samurai kernel: [ 4.707217] eth1: RXcsums[1] LinkChgREG[1] MIirq[1] ASF[0] WireSpeed[0] TSOcap[0]
Sep 18 22:34:19 samurai kernel: [ 4.707217] eth1: dma_rwctrl[76ff000f] dma_mask[64-bit]

^ permalink raw reply

* Re: [PATCH 1/3] wireless: implement basic ethtool support for cfg80211 devices
From: Johannes Berg @ 2009-10-01  8:51 UTC (permalink / raw)
  To: John W. Linville
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Kalle Valo, Kalle Valo,
	Luis R. Rodriguez
In-Reply-To: <1254359942-3483-1-git-send-email-linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 609 bytes --]

On Wed, 2009-09-30 at 21:19 -0400, John W. Linville wrote:

> +		if (!dev->ethtool_ops)
> +			dev->ethtool_ops = &cfg80211_ethtool_ops;
>  		break;

I might go so far and do it unconditionally so we get consistent
functionality across things. OTOH, full-mac drivers might be able to
support more.

> +const struct ethtool_ops cfg80211_ethtool_ops = {
> +	.get_drvinfo = cfg80211_get_drvinfo,
> +	.get_link = ethtool_op_get_link,
> +};
> +
> +void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)

if you change the order, you can make the latter static

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Avi Kivity @ 2009-10-01  8:34 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm,
	linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze,
	alacrityvm-devel
In-Reply-To: <4AC3B9C6.5090408@gmail.com>

On 09/30/2009 10:04 PM, Gregory Haskins wrote:


>> A 2.6.27 guest, or Windows guest with the existing virtio drivers, won't work
>> over vbus.
>>      
> Binary compatibility with existing virtio drivers, while nice to have,
> is not a specific requirement nor goal.  We will simply load an updated
> KMP/MSI into those guests and they will work again.  As previously
> discussed, this is how more or less any system works today.  It's like
> we are removing an old adapter card and adding a new one to "uprev the
> silicon".
>    

Virtualization is about not doing that.  Sometimes it's necessary (when 
you have made unfixable design mistakes), but just to replace a bus, 
with no advantages to the guest that has to be changed (other 
hypervisors or hypervisorless deployment scenarios aren't).

>>   Further, non-shmem virtio can't work over vbus.
>>      
> Actually I misspoke earlier when I said virtio works over non-shmem.
> Thinking about it some more, both virtio and vbus fundamentally require
> shared-memory, since sharing their metadata concurrently on both sides
> is their raison d'être.
>
> The difference is that virtio utilizes a pre-translation/mapping (via
> ->add_buf) from the guest side.  OTOH, vbus uses a post translation
> scheme (via memctx) from the host-side.  If anything, vbus is actually
> more flexible because it doesn't assume the entire guest address space
> is directly mappable.
>
> In summary, your statement is incorrect (though it is my fault for
> putting that idea in your head).
>    

Well, Xen requires pre-translation (since the guest has to give the host 
(which is just another guest) permissions to access the data).  So 
neither is a superset of the other, they're just different.

It doesn't really matter since Xen is unlikely to adopt virtio.

> An interesting thing here is that you don't even need a fancy
> multi-homed setup to see the effects of my exit-ratio reduction work:
> even single port configurations suffer from the phenomenon since many
> devices have multiple signal-flows (e.g. network adapters tend to have
> at least 3 flows: rx-ready, tx-complete, and control-events (link-state,
> etc).  Whats worse, is that the flows often are indirectly related (for
> instance, many host adapters will free tx skbs during rx operations, so
> you tend to get bursts of tx-completes at the same time as rx-ready.  If
> the flows map 1:1 with IDT, they will suffer the same problem.
>    

You can simply use the same vector for both rx and tx and poll both at 
every interrupt.

> In any case, here is an example run of a simple single-homed guest over
> standard GigE.  Whats interesting here is that .qnotify to .notify
> ratio, as this is the interrupt-to-signal ratio.  In this case, its
> 170047/151918, which comes out to about 11% savings in interrupt injections:
>
> vbus-guest:/home/ghaskins # netperf -H dev
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> dev.laurelwood.net (192.168.1.10) port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
> 1048576  16384  16384    10.01     940.77
> vbus-guest:/home/ghaskins # cat /sys/kernel/debug/pci-to-vbus-bridge
>    .events                        : 170048
>    .qnotify                       : 151918
>    .qinject                       : 0
>    .notify                        : 170047
>    .inject                        : 18238
>    .bridgecalls                   : 18
>    .buscalls                      : 12
> vbus-guest:/home/ghaskins # cat /proc/interrupts
>              CPU0
>     0:         87   IO-APIC-edge      timer
>     1:          6   IO-APIC-edge      i8042
>     4:        733   IO-APIC-edge      serial
>     6:          2   IO-APIC-edge      floppy
>     7:          0   IO-APIC-edge      parport0
>     8:          0   IO-APIC-edge      rtc0
>     9:          0   IO-APIC-fasteoi   acpi
>    10:          0   IO-APIC-fasteoi   virtio1
>    12:         90   IO-APIC-edge      i8042
>    14:       3041   IO-APIC-edge      ata_piix
>    15:       1008   IO-APIC-edge      ata_piix
>    24:     151933   PCI-MSI-edge      vbus
>    25:          0   PCI-MSI-edge      virtio0-config
>    26:        190   PCI-MSI-edge      virtio0-input
>    27:         28   PCI-MSI-edge      virtio0-output
>   NMI:          0   Non-maskable interrupts
>   LOC:       9854   Local timer interrupts
>   SPU:          0   Spurious interrupts
>   CNT:          0   Performance counter interrupts
>   PND:          0   Performance pending work
>   RES:          0   Rescheduling interrupts
>   CAL:          0   Function call interrupts
>   TLB:          0   TLB shootdowns
>   TRM:          0   Thermal event interrupts
>   THR:          0   Threshold APIC interrupts
>   MCE:          0   Machine check exceptions
>   MCP:          1   Machine check polls
>   ERR:          0
>   MIS:          0
>
> Its important to note here that we are actually looking at the interrupt
> rate, not the exit rate (which is usually a multiple of the interrupt
> rate, since you have to factor in as many as three exits per interrupt
> (IPI, window, EOI).  Therefore we saved about 18k interrupts in this 10
> second burst, but we may have actually saved up to 54k exits in the
> process. This is only over a 10 second window at GigE rates, so YMMV.
> These numbers get even more dramatic on higher end hardware, but I
> haven't had a chance to generate new numbers yet.
>    

(irq window exits should only be required on a small percentage of 
interrupt injections, since the guest will try to disable interrupts for 
short periods only)

> Looking at some external stats paints an even bleaker picture: "exits"
> as reported by kvm_stat for virtio-pci based virtio-net tip the scales
> at 65k/s vs 36k/s for vbus based venet.  And virtio is consuming ~30% of
> my quad-core's cpu, vs 19% for venet during the test.  Its hard to know
> which innovation or innovations may be responsible for the entire
> reduction, but certainly the interrupt-to-signal ratio mentioned above
> is probably helping.
>    

Can you please stop comparing userspace-based virtio hosts to 
kernel-based venet hosts?  We know the userspace implementation sucks.

> The even worse news for 1:1 models is that the ratio of
> exits-per-interrupt climbs with load (exactly when it hurts the most)
> since that is when the probability that the vcpu will need all three
> exits is the highest.
>    

Requiring all three exits means the guest is spending most of its time 
with interrupts disabled; that's unlikely.

Thanks for the numbers.  Are those 11% attributable to rx/tx 
piggybacking from the same interface?

Also, 170K interrupts -> 17K interrupts/sec -> 55kbit/interrupt -> 
6.8kB/interrupt.  Ignoring interrupt merging and assuming equal rx/tx 
distribution, that's about 13kB/interrupt.  Seems rather low for a 
saturated link.

>>      
>>> and priortizable/nestable signals.
>>>
>>>        
>> That doesn't belong in a bus.
>>      
> Everyone is of course entitled to an opinion, but the industry as a
> whole would disagree with you.  Signal path routing (1:1, aggregated,
> etc) is at the discretion of the bus designer.  Most buses actually do
> _not_ support 1:1 with IDT (think USB, SCSI, IDE, etc).
>    

With standard PCI, they do not.  But all modern host adapters support 
MSI and they will happily give you one interrupt per queue.

> PCI is somewhat of an outlier in that regard afaict.  Its actually a
> nice feature of PCI when its used within its design spec (HW).  For
> SW/PV, 1:1 suffers from, among other issues, that "triple-exit scaling"
> issue in the signal path I mentioned above.  This is one of the many
> reasons I think PCI is not the best choice for PV.
>    

Look at the vmxnet3 submission (recently posted on virtualization@).  
It's a perfectly ordinary PCI NIC driver, apart from having so many 'V's 
in the code.  16 rx queues, 8 tx queues, 25 MSIs, BARs for the 
registers.  So while the industry as a whole might disagree with me, it 
seems VMware does not.


>>> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png
>>>
>>>        
>> That's a red herring.  The problem is not with virtio as an ABI, but
>> with its implementation in userspace.  vhost-net should offer equivalent
>> performance to vbus.
>>      
> That's pure speculation.  I would advise you to reserve such statements
> until after a proper bakeoff can be completed.

Let's do that then.  Please reserve the corresponding comparisons from 
your side as well.

> This is not to mention
> that vhost-net does nothing to address our other goals, like scheduler
> coordination and non-802.x fabrics.
>    

What are scheduler coordination and non-802.x fabrics?

>> Right, when you ignore the points where they don't fit, it's a perfect
>> mesh.
>>      
> Where doesn't it fit?
>    

(avoiding infinite loop)

>>>> But that's not a strong argument for vbus; instead of adding vbus you
>>>> could make virtio more friendly to non-virt
>>>>
>>>>          
>>> Actually, it _is_ a strong argument then because adding vbus is what
>>> helps makes virtio friendly to non-virt, at least for when performance
>>> matters.
>>>
>>>        
>> As vhost-net shows, you can do that without vbus
>>      
> Citation please.  Afaict, the one use case that we looked at for vhost
> outside of KVM failed to adapt properly, so I do not see how this is true.
>    

I think Ira said he can make vhost work?

>> and without breaking compatibility.
>>      
> Compatibility with what?  vhost hasn't even been officially deployed in
> KVM environments afaict, nevermind non-virt.  Therefore, how could it
> possibly have compatibility constraints with something non-virt already?
>   Citation please.
>    

virtio-net over pci is deployed.  Replacing the backend with vhost-net 
will require no guest modifications.  Replacing the frontend with venet 
or virt-net/vbus-pci will require guest modifications.

Obviously virtio-net isn't deployed in non-virt.  But if we adopt vbus, 
we have to migrate guests.



>> Of course there is such a thing as native, a pci-ready guest has tons of
>> support built into it
>>      
> I specifically mentioned that already ([1]).
>
> You are also overstating its role, since the basic OS is what implements
> the native support for bus-objects, hotswap, etc, _not_ PCI.  PCI just
> rides underneath and feeds trivial events up, as do other bus-types
> (usb, scsi, vbus, etc).

But we have to implement vbus for each guest we want to support.  That 
includes Windows and older Linux which has a different internal API, so 
we have to port the code multiple times, to get existing functionality.

> And once those events are fed, you still need a
> PV layer to actually handle the bus interface in a high-performance
> manner so its not like you really have a "native" stack in either case.
>    

virtio-net doesn't use any pv layer.

>> that doesn't need to be retrofitted.
>>      
> No, that is incorrect.  You have to heavily modify the pci model with
> layers on top to get any kind of performance out of it.  Otherwise, we
> would just use realtek emulation, which is technically the native PCI
> you are apparently so enamored with.
>    

virtio-net doesn't modify the PCI model.  And if you look at vmxnet3, 
they mention that it conforms to something called UPT, which allows 
hardware vendors to implement parts of their NIC model.  So vmxnet3 is 
apparently suitable to both hardware and software implementations.

> Not to mention there are things you just plain can't do in PCI today,
> like dynamically assign signal-paths,

You can have dynamic MSI/queue routing with virtio, and each MSI can be 
routed to a vcpu at will.

> priority, and coalescing, etc.
>    

Do you mean interrupt priority?  Well, apic allows interrupt priorities 
and Windows uses them; Linux doesn't.  I don't see a reason to provide 
more than native hardware.

>> Since
>> practically everyone (including Xen) does their paravirt drivers atop
>> pci, the claim that pci isn't suitable for high performance is incorrect.
>>      
> Actually IIUC, I think Xen bridges to their own bus as well (and only
> where they have to), just like vbus.  They don't use PCI natively.  PCI
> is perfectly suited as a bridge transport for PV, as I think the Xen and
> vbus examples have demonstrated.  Its the 1:1 device-model where PCI has
> the most problems.
>    

N:1 breaks down on large guests since one vcpu will have to process all 
events.  You could do N:M, with commands to change routings, but where's 
your userspace interface?  You can't tell from /proc/interrupts which 
vbus interrupts are active, and irqbalance can't steer them towards less 
busy cpus since they're invisible to the interrupt controller.


>>> And lastly, why would you _need_ to use the so called "native"
>>> mechanism?  The short answer is, "you don't".  Any given system (guest
>>> or bare-metal) already have a wide-range of buses (try running "tree
>>> /sys/bus" in Linux).  More importantly, the concept of adding new buses
>>> is widely supported in both the Windows and Linux driver model (and
>>> probably any other guest-type that matters).  Therefore, despite claims
>>> to the contrary, its not hard or even unusual to add a new bus to the
>>> mix.
>>>
>>>        
>> The short answer is "compatibility".
>>      
> There was a point in time where the same could be said for virtio-pci
> based drivers vs realtek and e1000, so that argument is demonstrably
> silly.  No one tried to make virtio work in a binary compatible way with
> realtek emulation, yet we all survived the requirement for loading a
> virtio driver to my knowledge.
>    

The larger your installed base, the more difficult it is.  Of course 
it's doable, but I prefer not doing it and instead improving things in a 
binary backwards compatible manner.  If there is no choice we will bow 
to the inevitable and make our users upgrade.  But at this point there 
is a choice, and I prefer to stick with vhost-net until it is proven 
that it won't work.

> The bottom line is: Binary device compatibility is not required in any
> other system (as long as you follow sensible versioning/id rules), so
> why is KVM considered special?
>    

One of the benefits of virtualization is that the guest model is 
stable.  You can live-migrate guests and upgrade the hardware 
underneath.  You can have a single guest image that you clone to 
provision new guests.  If you switch to a new model, you give up those 
benefits, or you support both models indefinitely.

Note even hardware nowadays is binary compatible.  One e1000 driver 
supports a ton of different cards, and I think (not sure) newer cards 
will work with older drivers, just without all their features.

> The fact is, it isn't special (at least not in this regard).  What _is_
> required is "support" and we fully intend to support these proposed
> components.  I assure you that at least the users that care about
> maximum performance will not generally mind loading a driver.  Most of
> them would have to anyway if they want to get beyond realtek emulation.
>    

For a new install, sure.  I'm talking about existing deployments (and 
those that will exist by the time vbus is ready for roll out).

> I am certainly in no position to tell you how to feel, but this
> declaration would seem from my perspective to be more of a means to an
> end than a legitimate concern.  Otherwise we would never have had virtio
> support in the first place, since it was not "compatible" with previous
> releases.
>    

virtio was certainly not pain free, needing Windows drivers, updates to 
management tools (you can't enable it by default, so you have to offer 
it as a choice), mkinitrd, etc.  I'd rather not have to go through that 
again.

>>   Especially if the device changed is your boot disk.
>>      
> If and when that becomes a priority concern, that would be a function
> transparently supported in the BIOS shipped with the hypervisor, and
> would thus be invisible to the user.
>    

No, you have to update the driver in your initrd (for Linux) or properly 
install the new driver (for Windows).  It's especially difficult for 
Windows.

>>   You may not care about the pain caused to users, but I do, so I will
>> continue to insist on compatibility.
>>      
> For the users that don't care about maximum performance, there is no
> change (and thus zero pain) required.  They can use realtek or virtio if
> they really want to.  Neither is going away to my knowledge, and lets
> face it: 2.6Gb/s out of virtio to userspace isn't *that* bad.  But "good
> enough" isn't good enough, and I won't rest till we get to native
> performance.

I don't want to support both virtio and vbus in parallel.  There's 
enough work already.  If we adopt vbus, we'll have to deprecate and 
eventually kill off virtio.

> 2) True pain to users is not caused by lack of binary compatibility.
> Its caused by lack of support.  And its a good thing or we would all be
> emulating 8086 architecture forever...
>
> ..oh wait, I guess we kind of do that already ;).  But at least we can
> slip in something more advanced once in a while (APIC vs PIC, USB vs
> uart, iso9660 vs floppy, for instance) and update the guest stack
> instead of insisting it must look like ISA forever for compatibility's sake.
>    

PCI is continuously updated, with MSI, MSI-X, and IOMMU support being 
some recent updates.  I'd like to ride on top of that instead of having 
to clone it for every guest I support.

>> So we have: vbus needs a connector, vhost needs a connector.  vbus
>> doesn't need userspace to program the addresses (but does need userspace
>> to instantiate the devices and to program the bus address decode)
>>      
> First of all, bus-decode is substantially easier than per-device decode
> (you have to track all those per-device/per-signal fds somewhere,
> integrate with hotswap, etc), and its only done once per guest at
> startup and left alone.  So its already not apples to apples.
>    

Right, it means you can hand off those eventfds to other qemus or other 
pure userspace servers.  It's more flexible.

> Second, while its true that the general kvm-connector bus-decode needs
> to be programmed,  that is a function of adapting to the environment
> that _you_ created for me.  The original kvm-connector was discovered
> via cpuid and hypercalls, and didn't need userspace at all to set it up.
>   Therefore it would be entirely unfair of you to turn around and somehow
> try to use that trait of the design against me since you yourself
> imposed it.
>    

No kvm feature will ever be exposed to a guest without userspace 
intervention.  It's a basic requirement.  If it causes complexity (and 
it does) we have to live with it.

>>   Does it work on Windows?
>>      
> This question doesn't make sense.  Hotswap control occurs on the host,
> which is always Linux.
>
> If you were asking about whether a windows guest will support hotswap:
> the answer is "yes".  Our windows driver presents a unique PDO/FDO pair
> for each logical device instance that is pushed out (just like the built
> in usb, pci, scsi bus drivers that windows supports natively).
>    

Ah, you have a Windows venet driver?


>>> As an added bonus, its device-model is modular.  A developer can write a
>>> new device model, compile it, insmod it to the host kernel, hotplug it
>>> to the running guest with mkdir/ln, and the come back out again
>>> (hotunplug with rmdir, rmmod, etc).  They may do this all without taking
>>> the guest down, and while eating QEMU based IO solutions for breakfast
>>> performance wise.
>>>
>>> Afaict, qemu can't do either of those things.
>>>
>>>        
>> We've seen that herring before,
>>      
> Citation?
>    

It's the compare venet-in-kernel to virtio-in-userspace thing again.  
Let's defer that until mst completes vhost-net mergeable buffers, at which 
time we can compare vhost-net to venet and see how much vbus contributes 
to performance and how much of it comes from being in-kernel.

>>>> Refactor instead of duplicating.
>>>>
>>>>          
>>> There is no duplicating.  vbus has no equivalent today as virtio doesn't
>>> define these layers.
>>>
>>>        
>> So define them if they're missing.
>>      
> I just did.
>    

Since this is getting confusing to me, I'll start from scratch looking 
at the vbus layers, top to bottom:

Guest side:
1. venet guest kernel driver - AFAICT, duplicates the virtio-net guest 
driver functionality
2. vbus guest driver (config and hotplug) - duplicates pci, or if you 
need non-pci support, virtio config and its pci bindings; needs 
reimplementation for all supported guests
3. vbus guest driver (interrupt coalescing, priority) - if needed, 
should be implemented as an irqchip (and be totally orthogonal to the 
driver); needs reimplementation for all supported guests
4. vbus guest driver (shm/ioq) - finer-grained layering than virtio 
(which only supports the combination, due to the need for Xen support); 
can be retrofitted to virtio at some cost

Host side:
1. venet host kernel driver - is duplicated by vhost-net; doesn't 
support live migration, unprivileged users, or slirp
2. vbus host driver (config and hotplug) - duplicates pci support in 
userspace (which will need to be kept in any case); already has two 
userspace interfaces
3. vbus host driver (interrupt coalescing, priority) - if we think we 
need it (and I don't), should be part of kvm core, not a bus
4. vbus host driver (shm) - partially duplicated by vhost memory slots
5. vbus host driver (ioq) - duplicates userspace virtio, duplicated by vhost

>>> There is no rewriting.  vbus has no equivalent today as virtio doesn't
>>> define these layers.
>>>
>>> By your own admission, you said if you wanted that capability, use a
>>> library.  What I think you are not understanding is vbus _is_ that
>>> library.  So what is the problem, exactly?
>>>
>>>        
>> It's not compatible.
>>      
> No, that is incorrect.  What you are apparently not understanding is
> that not only is vbus that library, but its extensible.  So even if
> compatibility is your goal (it doesn't need to be IMO) it can be
> accommodated by how you interface to the library.
>    

To me, compatible means I can live migrate an image to a new system 
without the user knowing about the change.  You'll be able to do that 
with vhost-net.

>>>>
>>>>          
>>> No, it does not.  vbus just needs a relatively simple single message
>>> pipe between the guest and host (think "hypercall tunnel", if you will).
>>>
>>>        
>> That's ioeventfd.  So far so similar.
>>      
> No, that is incorrect.  For one, vhost uses them on a per-signal path
> basis, whereas vbus only has one channel for the entire guest->host.
>    

You'll probably need to change that as you start running smp guests.

> Second, I do not use ioeventfd anymore because it has too many problems
> with the surrounding technology.  However, that is a topic for a
> different thread.
>    

Please post your issues.  I see ioeventfd/irqfd as critical kvm interfaces.

>> vbus devices aren't magically instantiated.  Userspace needs to
>> instantiate them too.  Sure, there's less work on the host side since
>> you're using vbus instead of the native interface, but more work on the
>> guest side since you're using vbus instead of the native interface.
>>      
>
> No, that is incorrect.  The amount of "work" that a guest does is
> actually the same in both cases, since the guest OS performs the hotswap
> handling natively for all bus types (at least for Linux and Windows).
> You still need to have a PV layer to interface with those objects in
> both cases, as well, so there is no such thing as "native interface" for
> PV.  Its only a matter of where it occurs in the stack.
>    

I'm missing something.  Where's the pv layer for virtio-net?

Linux drivers have an abstraction layer to deal with non-pci.  But the 
Windows drivers are ordinary pci drivers with nothing that looks 
pv-ish.  You could implement virtio-net hardware if you wanted to.

>>   non-privileged-user capable?
>>      
> The short answer is "not yet (I think)".  I need to write a patch to
> properly set the mode attribute in sysfs, but I think this will be trivial.
>
>    

(and selinux label)

>> Ah, so you have two control planes.
>>      
> So what?  If anything, it goes to show how extensible the framework is
> that a new plane could be added in 119 lines of code:
>
> ~/git/linux-2.6>  stg show vbus-add-admin-ioctls.patch | diffstat
>   Makefile       |    3 -
>   config-ioctl.c |  117
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 119 insertions(+), 1 deletion(-)
>
> if and when having two control planes exceeds its utility, I will submit
> a simple patch that removes the useless one.
>    

It always begins with a 119-line patch and then grows, that's life.

>> kvm didn't have an existing counterpart in Linux when it was
>> proposed/merged.
>>      
> And likewise, neither does vbus.
>
>    

For virt uses, I don't see the need.  For non-virt, I have no opinion.


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 30/34] move virtnet_remove to .devexit.text
From: Uwe Kleine-König @ 2009-10-01  8:28 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sam Ravnborg, Andrew Morton, David S. Miller, Rusty Russell,
	Alex Williamson, Mark McLoughlin, netdev
In-Reply-To: <20091001082607.GA2181@pengutronix.de>

The function virtnet_remove is used only wrapped by __devexit_p so define
it using __devexit.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Alex Williamson <alex.williamson@hp.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/virtio_net.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d445845..8d00976 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -948,7 +948,7 @@ free:
 	return err;
 }
 
-static void virtnet_remove(struct virtio_device *vdev)
+static void __devexit virtnet_remove(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
 	struct sk_buff *skb;
-- 
1.6.4.3

^ permalink raw reply related

* [PATCH 22/34] don't use __devexit_p to wrap sgiseeq_remove
From: Uwe Kleine-König @ 2009-10-01  8:28 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sam Ravnborg, Andrew Morton, David S. Miller, Wang Chen,
	Ralf Baechle, Patrick McHardy, netdev
In-Reply-To: <20091001082607.GA2181@pengutronix.de>

The function sgiseeq_remove is defined using __exit, so don't use
__devexit_p but __exit_p to wrap it.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Wang Chen <wangchen@cn.fujitsu.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/sgiseeq.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/sgiseeq.c b/drivers/net/sgiseeq.c
index ecf3279..f4dfd1f 100644
--- a/drivers/net/sgiseeq.c
+++ b/drivers/net/sgiseeq.c
@@ -826,7 +826,7 @@ static int __exit sgiseeq_remove(struct platform_device *pdev)
 
 static struct platform_driver sgiseeq_driver = {
 	.probe	= sgiseeq_probe,
-	.remove	= __devexit_p(sgiseeq_remove),
+	.remove	= __exit_p(sgiseeq_remove),
 	.driver = {
 		.name	= "sgiseeq",
 		.owner	= THIS_MODULE,
-- 
1.6.4.3

^ permalink raw reply related

* [PATCH 13/34] don't use __devexit_p to wrap meth_remove
From: Uwe Kleine-König @ 2009-10-01  8:28 UTC (permalink / raw)
  To: linux-kernel
  Cc: Sam Ravnborg, Andrew Morton, David S. Miller, Ralf Baechle,
	Patrick McHardy, Johannes Berg, netdev
In-Reply-To: <20091001082607.GA2181@pengutronix.de>

The function meth_remove is defined using __exit, so don't use __devexit_p
but __exit_p to wrap it.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/meth.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/meth.c b/drivers/net/meth.c
index 92ceb68..2af8173 100644
--- a/drivers/net/meth.c
+++ b/drivers/net/meth.c
@@ -828,7 +828,7 @@ static int __exit meth_remove(struct platform_device *pdev)
 
 static struct platform_driver meth_driver = {
 	.probe	= meth_probe,
-	.remove	= __devexit_p(meth_remove),
+	.remove	= __exit_p(meth_remove),
 	.driver = {
 		.name	= "meth",
 		.owner	= THIS_MODULE,
-- 
1.6.4.3

^ permalink raw reply related

* Re: [PATCH 1/2] net/netfilter/ipvs: Move #define KMSG_COMPONENT to Makefile
From: Jan Engelhardt @ 2009-10-01  8:27 UTC (permalink / raw)
  To: Joe Perches
  Cc: Patrick McHardy, David S. Miller, Simon Horman, Julian Anastasov,
	Netfilter Developer Mailing List, netdev,
	Linux Kernel Mailing List, lvs-devel
In-Reply-To: <1254358235.2960.145.camel@Joe-Laptop.home>

On Thursday 2009-10-01 02:50, Joe Perches wrote:
>On Thu, 2009-10-01 at 02:31 +0200, Jan Engelhardt wrote:
>> Well I personally prefer the #include instead of hiding such in 
>> Makefiles. You know, when newcomers could start doing `grep 
>> KMSG_COMPONENT *.[ch]`. Perhaps GCC's -include flag in a Makefile
>> to avoid #includes in .c files?
>
>I imagine an eventual goal of standardizing the default
>pr_fmt define in kernel.h to
>
>	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>
>so that all pr_<level> calls get this unless otherwise
>specified.

I like that approach. Saves me adding that line to .c
files repeatedly.

>Or perhaps better, to get rid of pr_fmt(fmt) altogether and
>have printk emit the filename/modulename, function and/or
>code offset by using something like %pS after the level.

I object to that. You would be spamming the dmesg ring buffer
with all that info, plus

filename: you would have to keep filename strings in the kernel.
Surely I do not find that thrilling when there are ~18000
non-arch .[ch] files whose pathnames amount to 542K.
Same goes similar for functions.

modulename: obj-y files would only get "<built-in>" or something
for KBUILD_MODNAME. Printing that to dmesg is not too useful.

I would rather keep plain printk as-is.

^ permalink raw reply

* Re: [net-2.6 PATCH] ixgbe: correct the parameter description
From: David Miller @ 2009-10-01  8:10 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, gospo, jpirko, peter.p.waskiewicz.jr
In-Reply-To: <20091001065140.13279.42634.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Wed, 30 Sep 2009 23:51:41 -0700

> From: Jiri Pirko <jpirko@redhat.com>
> 
> ccffad25b5136958d4769ed6de5e87992dd9c65c changed parameters for function
> ixgbe_update_uc_addr_list_generic but parameter description was not updated.
> This patch corrects it.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> Acked-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied, thanks.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox